<?php

// xref: /plugin/aichat/vendor/mehrab-wj/tiktoken-php/tests/EncoderTest.php (revision 8817535b0c67f8b10e9b8c05dcdf58fc17827423)

namespace TikToken;

use PHPUnit\Framework\TestCase;

/**
 * Checks Encoder::encode() against fixed, known-good token id sequences.
 */
class EncoderTest extends TestCase
{
    /**
     * Encodes a set of sample strings (plain English, camelCase, digits,
     * emoji, a long sentence, accented Spanish and French text) and compares
     * each result with its expected list of token ids.
     */
    public function testEncode(): void
    {
        $encoder = new Encoder();

        $longText = <<<EOT
            BPE ensures that the most common words are represented in the vocabulary as a single token while the rare words are broken down into two or more subword tokens and this is in agreement with what a subword-based tokenization algorithm does.
            EOT;
        $longTextTokens = [33, 11401, 19047, 326, 262, 749, 2219, 2456, 389, 7997, 287, 262, 25818, 355, 257, 2060, 11241, 981, 262, 4071, 2456, 389, 5445, 866, 656, 734, 393, 517, 850, 4775, 16326, 290, 428, 318, 287, 4381, 351, 644, 257, 850, 4775, 12, 3106, 11241, 1634, 11862, 857, 13];

        $this->assertEquals([1212, 318, 617, 2420], $encoder->encode('This is some text'));
        $this->assertEquals([10134, 23858, 21746], $encoder->encode('hasOwnProperty'));
        $this->assertEquals([10163, 2231, 30924, 3829], $encoder->encode('1234567890'));
        $this->assertEquals([15496, 11854, 616, 1468, 1545], $encoder->encode('Hello darkness my old friend'));

        // The emoji must be the real code points (U+1F44B, U+1F30D): the trailing
        // ids 233 and 234, 235 are the byte-level tokens for 0x8B and 0x8C 0x8D,
        // i.e. the last bytes of the UTF-8 sequences of these two characters.
        $this->assertEquals([31373, 50169, 233, 995, 12520, 234, 235], $encoder->encode('hello 👋 world 🌍'));

        // The long text is encoded twice and must produce the same ids both times.
        // NOTE(review): presumably the repeat exercises a second/cached lookup in
        // Encoder — confirm before dropping either call.
        $this->assertEquals($longTextTokens, $encoder->encode($longText));
        $this->assertEquals($longTextTokens, $encoder->encode($longText));

        // Non-ASCII Latin text (Spanish, French).
        $this->assertEquals([38374, 268, 292, 256, 446, 274, 31215, 285, 8836, 13], $encoder->encode('Buenas tardes para mí.'));
        $this->assertEquals([65, 2634, 65, 2634], $encoder->encode('bébé'));
        $this->assertEquals([344, 979, 1556, 555, 48659, 660, 18702, 84, 2634, 551, 1216, 272, 16175, 15152, 28141, 1490, 22161, 390, 256, 7834, 8591, 4938, 43816], $encoder->encode('ceci est un texte accentué en français à visée de tester la validité'));
    }
}