xref: /plugin/aichat/vendor/mehrab-wj/tiktoken-php/src/Encoder.php (revision 8817535b0c67f8b10e9b8c05dcdf58fc17827423)
1*8817535bSAndreas Gohr<?php
2*8817535bSAndreas Gohr
3*8817535bSAndreas Gohrnamespace TikToken;
4*8817535bSAndreas Gohr
/**
 * Byte-pair-encoding (BPE) tokenizer.
 *
 * encode() splits the text into chunks with a regular expression, maps every
 * byte of a chunk to a stand-in character (data/characters.json), repeatedly
 * applies the merge rules read from data/vocab.bpe and finally looks every
 * resulting symbol up in data/encoder.json.
 *
 * The data files are loaded lazily on the first non-empty encode() call.
 */
class Encoder
{
    /** Pattern used by encode() to cut the input text into chunks. */
    private const TOKEN_PATTERN = "#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u";

    /** True once all data files were loaded successfully. */
    private bool $initialized = false;

    /**
     * Memoised bpe() results: mapped word => space separated symbols.
     * Entries are never evicted, so the cache grows with the number of distinct words seen.
     *
     * @var array<array-key, string>
     */
    private array $bpeCache = [];

    /**
     * Byte value (0-255) => stand-in character, decoded from data/characters.json.
     *
     * @var array<int, string>
     */
    private array $rawCharacters = [];

    /**
     * Symbol => token, decoded from data/encoder.json.
     * NOTE(review): the values are presumably integer token ids - confirm against the data file.
     *
     * @var array<array-key, mixed>
     */
    private array $encoder = [];

    /**
     * Merge priorities: $bpeRanks[$first][$second] = rank, lower ranks are merged first.
     *
     * @var array<array-key, array<array-key, int>>
     */
    private array $bpeRanks = [];

    /**
     * Load characters.json, encoder.json and vocab.bpe once.
     *
     * $initialized is only set after everything succeeded, so a failed attempt
     * is retried on the next call.
     *
     * @throws \RuntimeException if a data file cannot be read or split
     * @throws \JsonException    if a JSON data file cannot be decoded
     */
    private function initialize(): void
    {
        if ($this->initialized) {
            return;
        }

        $this->rawCharacters = json_decode($this->readDataFile('characters.json'), true, 512, JSON_THROW_ON_ERROR);
        $this->encoder = json_decode($this->readDataFile('encoder.json'), true, 512, JSON_THROW_ON_ERROR);

        $lines = preg_split('#\r\n|\r|\n#', $this->readDataFile('vocab.bpe'));
        if (false === $lines) {
            throw new \RuntimeException('Unable to split vocab.bpe');
        }

        $bpeMerges = [];
        // The first line is skipped (presumably a header/version line - confirm against vocab.bpe).
        foreach (array_slice($lines, 1) as $line) {
            $parts = preg_split('#(\s+)#', $line);
            if (false === $parts) {
                continue;
            }

            // array_filter() keeps the original keys; buildBpeRanks() relies on keys 0 and 1.
            $parts = array_filter($parts, $this->filterEmpty(...));
            if ([] !== $parts) {
                $bpeMerges[] = $parts;
            }
        }

        $this->bpeRanks = $this->buildBpeRanks($bpeMerges);
        $this->initialized = true;
    }

    /**
     * Read one file from the data directory next to the source directory.
     *
     * @throws \RuntimeException if the file cannot be read
     */
    private function readDataFile(string $name): string
    {
        $contents = file_get_contents(__DIR__.'/../data/'.$name);
        if (false === $contents) {
            throw new \RuntimeException('Unable to load '.$name);
        }

        return $contents;
    }

    /**
     * Encode a text into its list of tokens.
     *
     * Symbols that have no entry in the encoder table are returned unchanged
     * (as strings). If the pattern does not match anything, an empty list is
     * returned.
     *
     * @return list<mixed> the encoder entry of every symbol, in text order
     *
     * @throws \RuntimeException if a data file cannot be loaded
     * @throws \JsonException    if a JSON data file cannot be decoded
     */
    public function encode(string $text): array
    {
        // Strict comparison on purpose: empty() would also reject the valid input "0".
        if ('' === $text) {
            return [];
        }

        $this->initialize();

        preg_match_all(self::TOKEN_PATTERN, $text, $matches);
        $chunks = $matches[0] ?? [];

        $tokens = [];
        foreach ($chunks as $chunk) {
            // Re-reading the chunk as ISO-8859-1 turns every byte into exactly one
            // character with code point 0-255, i.e. this iterates over the bytes.
            $byteCharacters = mb_str_split(mb_convert_encoding($chunk, 'UTF-8', 'ISO-8859-1'), 1, 'UTF-8');

            $word = '';
            foreach ($byteCharacters as $byteCharacter) {
                // Bytes without a stand-in character are dropped.
                $word .= $this->rawCharacters[$this->characterToUnicode($byteCharacter)] ?? '';
            }

            // Nothing could be mapped: do not emit an empty pseudo token.
            if ('' === $word) {
                continue;
            }

            foreach (explode(' ', $this->bpe($word)) as $symbol) {
                $tokens[] = $this->encoder[$symbol] ?? $symbol;
            }
        }

        return $tokens;
    }

    /**
     * array_filter() callback: keep everything except null, false and the empty string.
     */
    private function filterEmpty(mixed $var): bool
    {
        return null !== $var && false !== $var && '' !== $var;
    }

    /**
     * Decode the first UTF-8 encoded character of the given string to its code point.
     *
     * Besides the 1-4 byte forms, the lead bytes of the obsolete 5 and 6 byte
     * forms are accepted as well. 0 is returned for an empty string, for a lead
     * byte that cannot start a sequence (128-191, 254, 255) and for a sequence
     * that is shorter than its lead byte announces.
     */
    private function characterToUnicode(string $characters): int
    {
        if ('' === $characters) {
            return 0;
        }

        $lead = ord($characters[0]);
        if ($lead <= 127) {
            return $lead;
        }

        // Number of continuation bytes and the payload bits carried by the lead byte.
        [$continuationBytes, $codePoint] = match (true) {
            $lead >= 192 && $lead <= 223 => [1, $lead - 192],
            $lead >= 224 && $lead <= 239 => [2, $lead - 224],
            $lead >= 240 && $lead <= 247 => [3, $lead - 240],
            $lead >= 248 && $lead <= 251 => [4, $lead - 248],
            $lead >= 252 && $lead <= 253 => [5, $lead - 252],
            default => [0, 0],
        };

        if (strlen($characters) <= $continuationBytes) {
            return 0;
        }

        // Every continuation byte contributes its value minus 128, shifted in 6 bits at a time.
        for ($offset = 1; $offset <= $continuationBytes; ++$offset) {
            $codePoint = $codePoint * 64 + (ord($characters[$offset]) - 128);
        }

        return $codePoint;
    }

    /**
     * Turn the list of merge rules into a rank lookup table.
     *
     * Rules are ranked in the order they appear, starting at 0. Entries that
     * do not have both key 0 and key 1 are ignored and do not consume a rank.
     *
     * @param array<array<mixed>> $bpes merge rules, each holding the two symbols under keys 0 and 1
     *
     * @return array<array-key, array<array-key, int>> $result[$first][$second] = rank
     */
    private function buildBpeRanks(array $bpes): array
    {
        $result = [];
        $rank = 0;
        foreach ($bpes as $bpe) {
            if (!isset($bpe[1], $bpe[0])) {
                continue;
            }

            $result[$bpe[0]][$bpe[1]] = $rank;
            ++$rank;
        }

        return $result;
    }

    /**
     * Find the adjacent pair of symbols with the lowest merge rank.
     *
     * @param list<string> $symbols
     *
     * @return array{0: string, 1: string}|null the pair, or null if no adjacent pair has a rank
     */
    private function lowestRankedPair(array $symbols): ?array
    {
        $bestPair = null;
        $bestRank = null;
        $lastIndex = count($symbols) - 1;
        for ($i = 0; $i < $lastIndex; ++$i) {
            $rank = $this->bpeRanks[$symbols[$i]][$symbols[$i + 1]] ?? null;
            if (null === $rank) {
                continue;
            }

            if (null === $bestRank || $rank < $bestRank) {
                $bestRank = $rank;
                $bestPair = [$symbols[$i], $symbols[$i + 1]];
            }
        }

        return $bestPair;
    }

    /**
     * Replace every non-overlapping occurrence of $first directly followed by
     * $second (scanning left to right) with the concatenated symbol.
     *
     * @param list<string> $symbols
     *
     * @return list<string>
     */
    private function mergePair(array $symbols, string $first, string $second): array
    {
        $merged = [];
        $length = count($symbols);
        $i = 0;
        while ($i < $length) {
            if ($i < $length - 1 && $symbols[$i] === $first && $symbols[$i + 1] === $second) {
                $merged[] = $first.$second;
                $i += 2;
            } else {
                $merged[] = $symbols[$i];
                ++$i;
            }
        }

        return $merged;
    }

    /**
     * Apply the merge rules to one word.
     *
     * The word is split into single characters; the adjacent pair with the
     * lowest rank is merged until no ranked pair is left or only one symbol
     * remains. Results are memoised in $bpeCache (words of fewer than two
     * characters are returned as they are and are not cached).
     *
     * @return string the resulting symbols joined by a single space
     */
    private function bpe(string $token): string
    {
        if (isset($this->bpeCache[$token])) {
            return $this->bpeCache[$token];
        }

        $symbols = mb_str_split($token, 1, 'UTF-8');
        if (count($symbols) < 2) {
            return $token;
        }

        // Every round merges at least one pair, so the symbol count strictly decreases.
        while (count($symbols) > 1) {
            $pair = $this->lowestRankedPair($symbols);
            if (null === $pair) {
                break;
            }

            $symbols = $this->mergePair($symbols, $pair[0], $pair[1]);
        }

        $result = implode(' ', $symbols);
        $this->bpeCache[$token] = $result;

        return $result;
    }
}
288