<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Utf8\PhpString;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Class to split text into chunks of a given size in tokens
 *
 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
 * Also keeps some overlap between chunks to preserve context.
 */
class TextSplitter
{
    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;

    /** @var Encoder token encoder used to measure the length of texts */
    protected Encoder $tiktok;

    /** @var string[] the most recently used sentences, prepended to the next chunk as overlap */
    protected array $sentenceQueue = [];

    /** @var int desired overlap between chunks in tokens */
    protected int $overlap;

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     * @param int $overlap desired overlap between chunks in tokens
     */
    public function __construct(int $chunksize, Encoder $tiktok, int $overlap = 200)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
        $this->overlap = $overlap;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * The text is split into sentences first. Sentences are appended to the current chunk until
     * the chunk size would be reached, then a new chunk is started. Each new chunk begins with
     * the remembered sentences of the previous one (the overlap).
     *
     * @param string $text
     * @return string[]
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a falsy sentence such as "0" must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                if (count($parts) > 1) {
                    array_unshift($sentences, ...$parts);
                    continue;
                }
                // the sentence could not be split any further; use it as is instead of retrying forever
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($this->tiktok->encode($chunk));
            }

            // remember every used sentence for the overlap, so the overlap stays a contiguous piece of text
            $this->rememberSentence($sentence);
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts, preferably at word boundaries
     *
     * The returned parts are never empty and, concatenated in order, give the original sentence.
     *
     * @param string $sentence
     * @return string[]
     */
    protected function splitLongSentence(string $sentence): array
    {
        // when force splitting, make sentences a quarter of the chunk size (but at least one token)
        $chunkSize = max(1, intdiv($this->chunkSize, 4));

        // Try naive approach first: split at word boundaries, whitespace and punctuation are kept as own parts
        $words = preg_split('/\b/', $sentence);
        if ($words === false) {
            $words = [$sentence]; // regex failure, treat the whole sentence as a single word
        }

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') {
                continue; // the split yields empty strings at the start and end of the sentence
            }
            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $chunkSize) {
                // word is too long, probably no spaces, split it further
                // flush the pending sub-sentence first to keep the original text order
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                $currentSubSentence = '';
                $currentSubSentenceLen = 0;
                // array_merge returns a new array, the result has to be assigned
                $subSentences = array_merge($subSentences, $this->splitString($word, $wordLen, $chunkSize));
            } elseif ($currentSubSentenceLen + $wordLen < $chunkSize) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') {
            $subSentences[] = $currentSubSentence;
        }

        return $subSentences;
    }

    /**
     * Split a string into smaller parts of approximately the given size
     * This is a naive split that does not care about word boundaries
     *
     * The number of pieces is derived from the token counts, the actual cut is done on characters.
     * Empty pieces are never returned; an empty text results in an empty array.
     *
     * @param string $text text to split
     * @param int $tokenlength length of the text in tokens
     * @param int $chunksize desired chunk size in tokens
     * @return string[]
     */
    protected function splitString(string $text, int $tokenlength, int $chunksize): array
    {
        $length = PhpString::strlen($text);
        if ($length < 1) {
            return [];
        }

        // guard against zero or negative sizes to avoid a division by zero
        $numPieces = max(1, (int) ceil($tokenlength / max(1, $chunksize)));
        $pieceLength = (int) ceil($length / $numPieces);

        // utf8 aware split
        $pieces = [];
        for ($offset = 0; $offset < $length; $offset += $pieceLength) {
            $pieces[] = PhpString::substr($text, $offset, $pieceLength);
        }
        return $pieces;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content no longer exceeds the
     * configured overlap. A single sentence longer than the overlap is thus not kept at all.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence(string $sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        // stop on an empty queue, otherwise a negative overlap would loop forever
        while (
            $this->sentenceQueue !== [] &&
            count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > $this->overlap
        ) {
            array_shift($this->sentenceQueue);
        }
    }
}