<?php

declare(strict_types=1);

namespace TikToken;

/**
 * Byte-level BPE (byte pair encoding) tokenizer.
 *
 * The lookup tables are read lazily, on the first non-empty call to
 * {@see Encoder::encode()}, from three files in the sibling "data" directory:
 * characters.json, encoder.json and vocab.bpe.
 */
class Encoder
{
    /**
     * Pre-tokenisation pattern: English contractions, runs of letters, runs of
     * digits, runs of other non-space characters (each optionally preceded by a
     * single space) and runs of whitespace.
     */
    private const TOKEN_PATTERN = "#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u";

    private bool $initialized = false;

    /** @var array<array-key, string> byte-encoded word => its symbols joined by a single space */
    private array $bpeCache = [];

    /** @var array<int, string> lookup keyed by byte value (0-255), loaded from characters.json */
    private array $rawCharacters = [];

    /** @var array<array-key, mixed> BPE symbol => token, loaded from encoder.json (presumably integer ids - confirm against the data file) */
    private array $encoder = [];

    /** @var array<array-key, array<array-key, int>> first symbol => second symbol => merge rank (lower merges first) */
    private array $bpeRanks = [];

    /**
     * Load the three data files once; later calls are no-ops.
     *
     * @throws \RuntimeException if a data file cannot be read, split or has an unexpected shape
     * @throws \JsonException    if a JSON data file is malformed
     */
    private function initialize(): void
    {
        if ($this->initialized) {
            return;
        }

        $this->rawCharacters = $this->loadJsonTable('characters.json');
        $this->encoder = $this->loadJsonTable('encoder.json');
        $this->bpeRanks = $this->buildBpeRanks($this->parseMerges($this->readDataFile('vocab.bpe')));
        $this->initialized = true;
    }

    /**
     * Read a file from the data directory.
     *
     * @throws \RuntimeException if the file cannot be read
     */
    private function readDataFile(string $name): string
    {
        $contents = file_get_contents(__DIR__.'/../data/'.$name);
        if (false === $contents) {
            throw new \RuntimeException('Unable to load '.$name);
        }

        return $contents;
    }

    /**
     * Read and decode a JSON data file that must contain an object or array.
     *
     * @return array<array-key, mixed>
     *
     * @throws \RuntimeException if the file cannot be read or does not decode to an array
     * @throws \JsonException    if the file is not valid JSON
     */
    private function loadJsonTable(string $name): array
    {
        $decoded = json_decode($this->readDataFile($name), true, 512, JSON_THROW_ON_ERROR);
        if (!is_array($decoded)) {
            throw new \RuntimeException('Unexpected content in '.$name);
        }

        return $decoded;
    }

    /**
     * Split the merge list into one list of symbols per line.
     *
     * The first line is skipped (it is not treated as a merge rule) and blank
     * lines are ignored.
     *
     * @return list<list<string>>
     *
     * @throws \RuntimeException if the content cannot be split into lines
     */
    private function parseMerges(string $bpeDictionary): array
    {
        $lines = preg_split('#\r\n|\r|\n#', $bpeDictionary);
        if (false === $lines) {
            throw new \RuntimeException('Unable to split vocab.bpe');
        }

        $merges = [];
        foreach (array_slice($lines, 1) as $line) {
            // PREG_SPLIT_NO_EMPTY yields a 0-based list even when the line has
            // leading whitespace, so buildBpeRanks() can rely on indexes 0 and 1.
            $symbols = preg_split('#\s+#', $line, -1, PREG_SPLIT_NO_EMPTY);
            if (false === $symbols || [] === $symbols) {
                continue;
            }
            $merges[] = $symbols;
        }

        return $merges;
    }

    /**
     * Tokenize a text.
     *
     * The text is cut into pieces with TOKEN_PATTERN, every byte of a piece is
     * replaced through the characters.json table, the result is merged with
     * BPE and each resulting symbol is looked up in encoder.json.
     *
     * Returns an empty list for an empty string and when the pattern finds
     * nothing (which includes a PCRE failure such as invalid UTF-8 input).
     *
     * @return array<int, mixed> tokens in text order; a symbol without an entry
     *                           in encoder.json is returned verbatim as a string
     *
     * @throws \RuntimeException if a data file cannot be loaded
     * @throws \JsonException    if a JSON data file is malformed
     */
    public function encode(string $text): array
    {
        // Not empty(): the text "0" is falsy but must still be tokenized.
        if ('' === $text) {
            return [];
        }

        $this->initialize();

        $found = preg_match_all(self::TOKEN_PATTERN, $text, $matches);
        if (false === $found || 0 === $found) {
            return [];
        }

        $tokens = [];
        foreach ($matches[0] as $piece) {
            // Reinterpreting the UTF-8 bytes as ISO-8859-1 turns every byte into
            // one character whose code point equals the byte value.
            $bytesAsCharacters = mb_convert_encoding($piece, 'UTF-8', 'ISO-8859-1');

            $byteEncoded = '';
            foreach (mb_str_split($bytesAsCharacters, 1, 'UTF-8') as $character) {
                // Bytes missing from the table are skipped.
                $byteEncoded .= $this->rawCharacters[$this->characterToUnicode($character)] ?? '';
            }

            foreach (explode(' ', $this->bpe($byteEncoded)) as $symbol) {
                $tokens[] = $this->encoder[$symbol] ?? $symbol;
            }
        }

        return $tokens;
    }

    /**
     * Decode the code point of the first UTF-8 character in a string.
     *
     * Handles lead bytes of 1- to 6-byte sequences. Returns 0 when the first
     * byte is not a lead byte (128-191) or is 254/255.
     *
     * @param string $characters non-empty string; only the first character is read
     */
    private function characterToUnicode(string $characters): int
    {
        $lead = ord($characters[0]);
        if ($lead <= 127) {
            return $lead;
        }

        // [value to subtract from the lead byte, number of continuation bytes]
        [$leadBase, $continuationBytes] = match (true) {
            $lead >= 192 && $lead <= 223 => [192, 1],
            $lead >= 224 && $lead <= 239 => [224, 2],
            $lead >= 240 && $lead <= 247 => [240, 3],
            $lead >= 248 && $lead <= 251 => [248, 4],
            $lead >= 252 && $lead <= 253 => [252, 5],
            default => [0, 0],
        };
        if (0 === $continuationBytes) {
            return 0;
        }

        // Every continuation byte contributes its low six bits.
        $codePoint = $lead - $leadBase;
        for ($offset = 1; $offset <= $continuationBytes; ++$offset) {
            $codePoint = $codePoint * 64 + (ord($characters[$offset]) - 128);
        }

        return $codePoint;
    }

    /**
     * Turn the ordered merge list into a rank lookup.
     *
     * Entries with fewer than two symbols are ignored and do not consume a rank.
     *
     * @param array<array<mixed>> $bpes merge rules, each holding the two symbols at index 0 and 1
     *
     * @return array<array<int>> first symbol => second symbol => rank
     */
    private function buildBpeRanks(array $bpes): array
    {
        $ranks = [];
        $nextRank = 0;
        foreach ($bpes as $merge) {
            if (!isset($merge[0], $merge[1])) {
                continue;
            }

            $ranks[$merge[0]][$merge[1]] = $nextRank++;
        }

        return $ranks;
    }

    /**
     * Apply the BPE merges to one byte-encoded word.
     *
     * Starting from single characters, the adjacent pair with the lowest rank
     * is merged repeatedly until no adjacent pair has a rank or only one symbol
     * is left. Results of words with two or more characters are cached.
     *
     * @return string the resulting symbols joined by single spaces
     */
    private function bpe(string $token): string
    {
        if (isset($this->bpeCache[$token])) {
            return $this->bpeCache[$token];
        }

        $word = mb_str_split($token, 1, 'UTF-8');
        if (count($word) < 2) {
            return $token;
        }

        while (count($word) > 1) {
            $bigram = $this->findLowestRankedPair($word);
            if (null === $bigram) {
                break;
            }

            $word = $this->mergePair($word, $bigram[0], $bigram[1]);
        }

        $merged = implode(' ', $word);
        $this->bpeCache[$token] = $merged;

        return $merged;
    }

    /**
     * Find the adjacent pair of symbols with the lowest merge rank.
     *
     * @param list<string> $word
     *
     * @return array{0: string, 1: string}|null null when no adjacent pair has a rank
     */
    private function findLowestRankedPair(array $word): ?array
    {
        $bestPair = null;
        $bestRank = PHP_INT_MAX;
        for ($index = 1, $length = count($word); $index < $length; ++$index) {
            $rank = $this->bpeRanks[$word[$index - 1]][$word[$index]] ?? null;
            if (null !== $rank && (null === $bestPair || $rank < $bestRank)) {
                $bestRank = $rank;
                $bestPair = [$word[$index - 1], $word[$index]];
            }
        }

        return $bestPair;
    }

    /**
     * Replace every non-overlapping occurrence of the pair, scanning left to
     * right, with the concatenation of its two symbols.
     *
     * @param list<string> $word
     *
     * @return list<string>
     */
    private function mergePair(array $word, string $first, string $second): array
    {
        $merged = [];
        $length = count($word);
        $index = 0;
        while ($index < $length) {
            if ($index < $length - 1 && $word[$index] === $first && $word[$index + 1] === $second) {
                $merged[] = $first.$second;
                $index += 2;
            } else {
                $merged[] = $word[$index];
                ++$index;
            }
        }

        return $merged;
    }
}