xref: /plugin/aichat/TextSplitter.php (revision 14b3ee00e0a989e3a7302d235ba8ce85ca383832)
1072e0099SAndreas Gohr<?php
2072e0099SAndreas Gohr
3072e0099SAndreas Gohrnamespace dokuwiki\plugin\aichat;
4072e0099SAndreas Gohr
5028fe6dbSAndreas Gohruse dokuwiki\Utf8\PhpString;
6072e0099SAndreas Gohruse TikToken\Encoder;
7072e0099SAndreas Gohruse Vanderlee\Sentence\Sentence;
8072e0099SAndreas Gohr
/**
 * Class to split text into chunks of a given size in tokens
 *
 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
 * Also keeps some overlap between chunks to preserve context.
 */
class TextSplitter
{
    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;
    /** @var Encoder token encoder used to measure the length of text */
    protected Encoder $tiktok;
    /** @var string[] the most recently used sentences, prepended to the next chunk as overlap */
    protected array $sentenceQueue = [];
    /** @var int desired overlap between chunks in tokens */
    protected int $overlap;

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     * @param int $overlap desired overlap between chunks in tokens
     */
    public function __construct(int $chunksize, Encoder $tiktok, int $overlap = 200)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
        $this->overlap = $overlap;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * Sentences are appended to the current chunk until the token budget is used up. A new chunk
     * then starts with the remembered overlap sentences followed by the sentence that did not fit.
     *
     * @param string $text
     * @return string[]
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // Compare against null explicitly: a plain truthiness check would end the loop on
        // falsy sentences like "0" or "" and silently drop all the remaining text
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') continue;

            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                // Only re-queue when splitting made progress, otherwise an unsplittable fragment
                // would be shifted and unshifted forever. Such a fragment is used as is.
                if ($parts !== [$sentence]) {
                    array_unshift($sentences, ...$parts);
                    continue;
                }
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($this->tiktok->encode($chunk));
            }

            // remember every sentence that went into a chunk, including the one that opened a new
            // chunk, so the overlap of the following chunk is a contiguous tail of this one
            $this->rememberSentence($sentence);
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts, preferably at word boundaries
     *
     * The returned parts are never empty and, concatenated in order, give the original sentence.
     *
     * @param string $sentence
     * @return string[]
     */
    protected function splitLongSentence(string $sentence): array
    {
        // when force splitting, make sentences a quarter of the chunk size. The limit is kept a
        // positive integer: it is passed to splitString() as int and used as a divisor there
        $chunkSize = max(1, intdiv($this->chunkSize, 4));

        // Try naive approach first: split at word boundaries (whitespace/punctuation become own parts)
        $words = preg_split('/\b/', $sentence, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($words === false) $words = [$sentence]; // regex engine failure, treat as one long word

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') continue; // a boundary at the very start or end yields empty parts

            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $chunkSize) {
                // word is too long, probably no spaces, split it further
                // flush what was collected so far first, to keep the text in its original order
                if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;
                $currentSubSentence = '';
                $currentSubSentenceLen = 0;
                // array_merge returns a new array, the result has to be assigned
                $subSentences = array_merge($subSentences, $this->splitString($word, $wordLen, $chunkSize));
            } elseif ($currentSubSentenceLen + $wordLen < $chunkSize) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') $subSentences[] = $currentSubSentence;

        return $subSentences;
    }

    /**
     * Split a string into smaller parts of approximately the given size
     * This is a naive split that does not care about word boundaries
     *
     * The number of parts is derived from the token counts, the actual cutting is done on
     * characters. Only non-empty parts are returned, so there may be fewer parts than the token
     * counts suggest when the text has fewer characters than that.
     *
     * @param string $text text to split
     * @param int $tokenlength length of the text in tokens
     * @param int $chunksize desired chunk size in tokens
     * @return string[]
     */
    protected function splitString(string $text, int $tokenlength, int $chunksize): array
    {
        $textLength = PhpString::strlen($text);
        if ($textLength < 1) return [];

        // guard the divisors and work with integers for the offsets
        $numPieces = max(1, (int) ceil($tokenlength / max(1, $chunksize)));
        $pieceLength = (int) ceil($textLength / $numPieces); // >= 1 because the text is not empty

        // utf8 aware split
        $pieces = [];
        for ($offset = 0; $offset < $textLength; $offset += $pieceLength) {
            $pieces[] = PhpString::substr($text, $offset, $pieceLength);
        }
        return $pieces;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its space-joined content is no longer than the
     * configured overlap. A single sentence longer than the overlap leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        // the emptiness check guarantees termination for any overlap value
        while (
            $this->sentenceQueue !== [] &&
            count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > $this->overlap
        ) {
            array_shift($this->sentenceQueue);
        }
    }
}
165