<?php

namespace TikToken;

use PHPUnit\Framework\TestCase;

/**
 * Checks Encoder::encode() against fixed, known token ID sequences.
 */
class EncoderTest extends TestCase
{
    /**
     * Encodes a set of sample strings and compares each result with the
     * token IDs expected for it.
     *
     * The samples cover plain English, an identifier, digits, emoji,
     * accented Latin text and one long sentence.
     */
    public function testEncode(): void
    {
        $encoder = new Encoder();

        // Built by concatenation so the exact bytes do not depend on heredoc
        // indentation rules; the text has no leading or trailing whitespace.
        $longText = 'BPE ensures that the most common words are represented in the vocabulary as a single token'
            . ' while the rare words are broken down into two or more subword tokens'
            . ' and this is in agreement with what a subword-based tokenization algorithm does.';

        $longTextTokens = [
            33, 11401, 19047, 326, 262, 749, 2219, 2456, 389, 7997, 287, 262,
            25818, 355, 257, 2060, 11241, 981, 262, 4071, 2456, 389, 5445, 866,
            656, 734, 393, 517, 850, 4775, 16326, 290, 428, 318, 287, 4381,
            351, 644, 257, 850, 4775, 12, 3106, 11241, 1634, 11862, 857, 13,
        ];

        // Each entry is [expected token IDs, input text].
        $cases = [
            [[1212, 318, 617, 2420], 'This is some text'],
            [[10134, 23858, 21746], 'hasOwnProperty'],
            [[10163, 2231, 30924, 3829], '1234567890'],
            [[15496, 11854, 616, 1468, 1545], 'Hello darkness my old friend'],
            // The two emoji (U+1F44B waving hand, U+1F30D globe) are written as
            // escapes so that tools which drop 4-byte UTF-8 characters cannot
            // silently turn this input into plain "hello world".
            [[31373, 50169, 233, 995, 12520, 234, 235], "hello \u{1F44B} world \u{1F30D}"],
            [$longTextTokens, $longText],
            // NOTE(review): the long text is encoded a second time, as in the
            // original test. Presumably this guards against state kept between
            // encode() calls changing the result - confirm, or drop the repeat.
            [$longTextTokens, $longText],
            [[38374, 268, 292, 256, 446, 274, 31215, 285, 8836, 13], 'Buenas tardes para mí.'],
            [[65, 2634, 65, 2634], 'bébé'],
            [
                [
                    344, 979, 1556, 555, 48659, 660, 18702, 84, 2634, 551, 1216, 272,
                    16175, 15152, 28141, 1490, 22161, 390, 256, 7834, 8591, 4938, 43816,
                ],
                'ceci est un texte accentué en français à visée de tester la validité',
            ],
        ];

        foreach ($cases as [$expected, $text]) {
            $this->assertEquals(
                $expected,
                $encoder->encode($text),
                sprintf('Unexpected token IDs for input "%s"', $text)
            );
        }
    }
}