xref: /plugin/aichat/TextSplitter.php (revision 072e009990858d649f31eceb61c1bc980d28f40c)
1*072e0099SAndreas Gohr<?php
2*072e0099SAndreas Gohr
3*072e0099SAndreas Gohrnamespace dokuwiki\plugin\aichat;
4*072e0099SAndreas Gohr
5*072e0099SAndreas Gohruse TikToken\Encoder;
6*072e0099SAndreas Gohruse Vanderlee\Sentence\Sentence;
7*072e0099SAndreas Gohr
8*072e0099SAndreas Gohr/**
9*072e0099SAndreas Gohr * Class to split text into chunks of a given size in tokens
10*072e0099SAndreas Gohr *
11*072e0099SAndreas Gohr * Prefers to split at sentence boundaries, but will split long sentences if necessary.
12*072e0099SAndreas Gohr * Also keeps some overlap between chunks to preserve context.
13*072e0099SAndreas Gohr */
class TextSplitter
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var int maximum size of a chunk in tokens */
    protected int $chunkSize;

    /** @var Encoder encoder used to count the tokens of a text */
    protected Encoder $tiktok;

    /** @var string[] most recently processed sentences, prepended to the next chunk as overlap */
    protected array $sentenceQueue = [];

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     */
    public function __construct(int $chunksize, Encoder $tiktok)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * Sentences are appended to the current chunk while they fit. When a sentence does not fit
     * anymore, the current chunk is finished and a new one is started with the remembered
     * sentences (the overlap) followed by that sentence. Sentences longer than a whole chunk are
     * broken into smaller fragments first.
     *
     * @param string $text
     * @return string[] the trimmed, non-empty chunks in reading order
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // Loop on the queue itself, not on the truthiness of the shifted value: a sentence such
        // as "0" or an empty fragment must not end processing and drop the remaining text.
        while ($sentences !== []) {
            $sentence = (string) array_shift($sentences);
            if ($sentence === '') {
                continue;
            }

            $slen = $this->tokenCount($sentence);
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                if ($parts !== [$sentence]) {
                    $sentences = array_merge($parts, $sentences);
                    continue;
                }
                // The fragment could not be broken down any further. Re-queueing it would loop
                // forever, so it is accepted as-is even though it exceeds the chunk size.
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }

                // start new chunk with remembered sentences
                [$chunk, $chunklen] = $this->startChunk($sentence);
            }

            // Remember every sentence, including the one that opened a new chunk. Otherwise the
            // overlap of the following chunk would have a gap in it.
            $this->rememberSentence($sentence);
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts
     *
     * The sentence is split at whitespace and the words (and the whitespace between them) are
     * regrouped into fragments of roughly a quarter of the chunk size. Words that are too long
     * on their own are cut into pieces of a fixed number of characters. The returned fragments
     * are never empty and, concatenated in order, yield the original sentence again.
     *
     * @param string $sentence
     * @return string[]
     */
    protected function splitLongSentence(string $sentence)
    {
        // when force splitting, make sentences a quarter of the chunk size
        $limit = $this->chunkSize / 4;
        // str_split()/array_chunk() need a positive integer length, the division may yield a float or 0
        $pieceLength = max(1, (int) $limit);

        // Try naive approach first: split by spaces (the captured whitespace is kept as "words", too)
        $words = preg_split('/(\s+)/', $sentence, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($words === false) {
            $words = [$sentence];
        }

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') {
                continue;
            }
            $wordLen = $this->tokenCount($word);

            if ($wordLen > $limit) {
                // Flush what was collected so far first, otherwise the text preceding the long
                // word would end up behind it in the result
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                $currentSubSentence = '';
                $currentSubSentenceLen = 0;

                // If a single word is too long, split it into smaller chunks
                // FIXME this splitting should be done by tokens, not by characters
                $subSentences = array_merge($subSentences, $this->splitByCharacters($word, $pieceLength));
            } elseif ($currentSubSentenceLen + $wordLen < $limit) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') {
            $subSentences[] = $currentSubSentence;
        }

        return $subSentences;
    }


    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its space-joined content is at most
     * MAX_OVERLAP_LEN tokens long. A single sentence above that limit empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence(string $sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        while (
            $this->sentenceQueue !== [] &&
            $this->tokenCount(implode(' ', $this->sentenceQueue)) > self::MAX_OVERLAP_LEN
        ) {
            array_shift($this->sentenceQueue);
        }
    }

    /**
     * Build the beginning of a new chunk from the remembered sentences and the given sentence
     *
     * The oldest remembered sentences are left out as long as overlap and sentence together
     * would exceed the chunk size. The sentence queue itself is not modified.
     *
     * @param string $sentence the sentence that did not fit into the previous chunk
     * @return array{0: string, 1: int} the new chunk text and its length in tokens
     */
    private function startChunk(string $sentence): array
    {
        $overlap = $this->sentenceQueue;

        while (true) {
            $chunk = implode(' ', $overlap) . $sentence;
            $chunklen = $this->tokenCount($chunk);
            if ($chunklen <= $this->chunkSize || $overlap === []) {
                break;
            }
            array_shift($overlap);
        }

        return [$chunk, $chunklen];
    }

    /**
     * Cut a text into pieces of at most the given number of characters
     *
     * Works on UTF-8 characters so multibyte characters are not torn apart. Falls back to a
     * byte-wise split when the text cannot be processed as UTF-8.
     *
     * @param string $text
     * @param int $length maximum number of characters per piece, must be 1 or larger
     * @return string[]
     */
    private function splitByCharacters(string $text, int $length): array
    {
        $characters = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($characters === false) {
            return str_split($text, $length);
        }

        return array_map(
            static fn (array $piece): string => implode('', $piece),
            array_chunk($characters, $length)
        );
    }

    /**
     * Count the tokens the encoder produces for the given text
     *
     * @param string $text
     * @return int
     */
    private function tokenCount(string $text): int
    {
        return count($this->tiktok->encode($text));
    }
}
146