1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Utf8\PhpString;
6use TikToken\Encoder;
7use Vanderlee\Sentence\Sentence;
8
9/**
10 * Class to split text into chunks of a given size in tokens
11 *
12 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
13 * Also keeps some overlap between chunks to preserve context.
14 */
class TextSplitter
{
    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;
    /** @var Encoder token encoder used to measure text in tokens */
    protected Encoder $tiktok;
    /** @var string[] most recently used sentences, prepended to the next chunk as overlap */
    protected array $sentenceQueue = [];
    /** @var int desired overlap between chunks in tokens */
    protected int $overlap;

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     * @param int $overlap desired overlap between chunks in tokens
     */
    public function __construct(int $chunksize, Encoder $tiktok, int $overlap = 200)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
        $this->overlap = $overlap;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * Sentences are appended to the current chunk as long as they fit. When a sentence does
     * not fit, the current chunk is emitted and a new one is started from the remembered
     * overlap sentences plus the new sentence. Because the overlap is prepended on top of the
     * new sentence, a chunk may be larger than the configured size by up to the overlap.
     *
     * @param string $text
     * @return string[] the non-empty, trimmed chunks in text order
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a falsy sentence like "0" must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') continue; // nothing to add

            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                if (count($parts) > 1) {
                    array_unshift($sentences, ...$parts);
                    continue;
                }
                // the sentence could not be split any further; accept it as is instead of
                // re-queueing the same text forever
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($this->tiktok->encode($chunk));
            }

            // remember every sentence that went into a chunk, so the overlap of the next chunk
            // is made of the sentences directly preceding it
            $this->rememberSentence($sentence);
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts, preferably at word boundaries
     *
     * The returned parts are never empty and, concatenated in order, give the original sentence.
     *
     * @param string $sentence
     * @return string[]
     */
    protected function splitLongSentence(string $sentence): array
    {
        // when force splitting, make sentences a quarter of the chunk size (whole tokens, at least one)
        $chunkSize = max(1, intdiv($this->chunkSize, 4));

        // Try naive approach first: split at word boundaries (unicode aware)
        $words = preg_split('/\b/u', $sentence);
        if ($words === false) {
            // the unicode aware split failed (e.g. invalid UTF-8), retry byte based
            $words = preg_split('/\b/', $sentence);
        }
        if ($words === false) {
            $words = [$sentence];
        }

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') continue; // preg_split yields empty pieces at the string edges

            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $chunkSize) {
                // word is too long, probably no spaces, split it further
                // flush what we collected so far first, to keep the text in order
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                    $currentSubSentence = '';
                    $currentSubSentenceLen = 0;
                }
                foreach ($this->splitString($word, $wordLen, $chunkSize) as $piece) {
                    if ($piece !== '') $subSentences[] = $piece;
                }
            } elseif ($currentSubSentenceLen + $wordLen < $chunkSize) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;

        return $subSentences;
    }

    /**
     * Split a string into smaller parts of approximately the given size
     * This is a naive split that does not care about word boundaries
     *
     * The number of parts is derived from the token counts, the cut positions are then
     * distributed evenly over the characters of the text.
     *
     * @param string $text text to split
     * @param int $tokenlength length of the text in tokens
     * @param int $chunksize desired chunk size in tokens
     * @return string[] non-empty parts; an empty list for an empty text
     */
    protected function splitString(string $text, int $tokenlength, int $chunksize): array
    {
        $length = PhpString::strlen($text);
        if ($length < 1) return [];

        // guard against zero or negative sizes to avoid a division by zero
        $numPieces = max(1, (int) ceil($tokenlength / max(1, $chunksize)));
        $pieceLength = max(1, (int) ceil($length / $numPieces));

        // utf8 aware split, stops at the end of the text so no empty parts are produced
        $pieces = [];
        for ($offset = 0; $offset < $length; $offset += $pieceLength) {
            $pieces[] = PhpString::substr($text, $offset, $pieceLength);
        }
        return $pieces;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content is no longer than the
     * configured overlap. A single sentence longer than the overlap leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        // the emptiness check guarantees termination even for a negative overlap
        while (
            $this->sentenceQueue !== [] &&
            count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > $this->overlap
        ) {
            array_shift($this->sentenceQueue);
        }
    }
}
165