xref: /plugin/aichat/TextSplitter.php (revision 072e009990858d649f31eceb61c1bc980d28f40c)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use TikToken\Encoder;
6use Vanderlee\Sentence\Sentence;
7
8/**
9 * Class to split text into chunks of a given size in tokens
10 *
11 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
12 * Also keeps some overlap between chunks to preserve context.
13 */
class TextSplitter
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;
    /** @var Encoder token encoder used to measure text length in tokens */
    protected Encoder $tiktok;
    /** @var string[] most recently used sentences, prepended as overlap when a new chunk is started */
    protected array $sentenceQueue = [];

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     */
    public function __construct(int $chunksize, Encoder $tiktok)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * The text is split into sentences which are collected until the chunk size would be reached.
     * Sentences longer than the chunk size are force split first. Every new chunk starts with the
     * remembered sentences of the previous one, so a chunk may exceed the configured size by
     * roughly the overlap length.
     *
     * @param string $text
     * @return string[] the trimmed, non-empty chunks in document order
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // array_shift() returns null once the list is empty. Compare against null explicitly: a falsy
        // fragment like '' or '0' must not end the loop early and drop the remaining text
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') continue; // nothing to add

            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                if (count($parts) > 1) {
                    array_unshift($sentences, ...$parts);
                    continue;
                }
                // the sentence could not be split any further: use it as is instead of looping forever
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($this->tiktok->encode($chunk));

                // the sentence opening the new chunk has to be remembered as well, otherwise the
                // overlap of the following chunk would skip it and no longer be contiguous text
                $this->rememberSentence($sentence);
            }
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts
     *
     * Words (and the whitespace between them) are collected into parts of about a quarter of the
     * chunk size. Words that are too long on their own are cut into pieces. The parts are returned
     * in their original order and concatenating them yields the original sentence.
     *
     * @param string $sentence
     * @return string[] non-empty parts of the sentence
     */
    protected function splitLongSentence($sentence)
    {
        $chunkSize = $this->chunkSize / 4; // when force splitting, make sentences a quarter of the chunk size
        $pieceLen = max(1, (int)$chunkSize); // splitting needs an integer length of at least 1

        // Try naive approach first: split by spaces (the whitespace is kept as separate elements)
        $words = preg_split('/(\s+)/', $sentence, -1, PREG_SPLIT_DELIM_CAPTURE);
        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') continue; // preg_split yields empty strings at the sentence borders

            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $chunkSize) {
                // flush what has been collected so far, so the text order is preserved
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                    $currentSubSentence = '';
                    $currentSubSentenceLen = 0;
                }
                // If a single word is too long, split it into smaller chunks
                foreach ($this->splitLongWord($word, $pieceLen) as $piece) {
                    $subSentences[] = $piece;
                }
            } elseif ($currentSubSentenceLen + $wordLen < $chunkSize) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;

        return $subSentences;
    }

    /**
     * Cut a single word into pieces of the given number of characters
     *
     * Splits at UTF-8 character boundaries so multibyte characters stay intact. Input that is not
     * valid UTF-8 is split byte-wise instead.
     *
     * FIXME this splitting should be done by tokens, not by characters
     *
     * @param string $word
     * @param int $length maximum number of characters per piece, must be at least 1
     * @return string[]
     */
    protected function splitLongWord($word, $length)
    {
        $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // preg_split fails on invalid UTF-8 when the u modifier is used
            return str_split($word, $length);
        }

        return array_map(
            static fn(array $piece): string => implode('', $piece),
            array_chunk($chars, $length)
        );
    }


    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is used as overlap when a new chunk is started. The oldest sentences are dropped
     * until the queue fits into MAX_OVERLAP_LEN tokens; a single sentence longer than that leaves
     * the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        while (count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
146