<?php

namespace dokuwiki\plugin\aichat;

use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Class to split text into chunks of a given size in tokens
 *
 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
 * Also keeps some overlap between chunks to preserve context.
 */
class TextSplitter
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;

    /** @var Encoder token encoder used to measure text lengths */
    protected Encoder $tiktok;

    /** @var string[] most recently added sentences, prepended to the next chunk as overlap */
    protected array $sentenceQueue = [];

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     */
    public function __construct(int $chunksize, Encoder $tiktok)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * The text is split into sentences first. Sentences are appended to the current chunk
     * as long as the token budget allows it. When a chunk is full, the next chunk is started
     * with the remembered overlap sentences followed by the current sentence.
     *
     * @param string $text
     * @return string[] the trimmed, non-empty chunks in text order
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // Compare against null explicitly: a falsy sentence such as "0" or "" must not
        // terminate the loop and silently drop the remaining text
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') {
                continue; // nothing to add
            }

            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                array_unshift($sentences, ...$this->splitLongSentence($sentence));
                continue;
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }

                // start new chunk with remembered sentences
                // NOTE(review): the sentence that opens the new chunk is not added to the
                // overlap queue; kept as-is to preserve the existing chunk output
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($this->tiktok->encode($chunk));
            }
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts
     *
     * Words (and the whitespace between them) are collected into sub-sentences of roughly
     * a quarter of the chunk size. Words that are too long on their own are cut into pieces.
     * The returned parts are in text order and never empty.
     *
     * @param string $sentence
     * @return string[]
     */
    protected function splitLongSentence($sentence)
    {
        $chunkSize = $this->chunkSize / 4; // when force splitting, make sentences a quarter of the chunk size
        // piece length for over-long words; must be a positive integer
        $maxBytes = max(1, (int)$chunkSize);

        // Try naive approach first: split by spaces (whitespace is kept as separate elements)
        $words = preg_split('/(\s+)/', $sentence, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($words === false) {
            $words = [$sentence];
        }

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $chunkSize) {
                // Flush what has been collected so far, otherwise the word pieces would
                // end up in front of the text that precedes them
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                $currentSubSentence = '';
                $currentSubSentenceLen = 0;

                // If a single word is too long, split it into smaller chunks
                //FIXME this splitting should be done by tokens, not by characters
                foreach ($this->splitLongWord($word, $maxBytes) as $part) {
                    $subSentences[] = $part;
                }
            } elseif ($currentSubSentenceLen + $wordLen < $chunkSize) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') {
            $subSentences[] = $currentSubSentence;
        }

        return $subSentences;
    }

    /**
     * Cut a single word into pieces of at most $maxBytes bytes
     *
     * Pieces end on UTF-8 character boundaries where possible. Plain byte splitting is used
     * when the word is not valid UTF-8 or when a single character is larger than the limit.
     *
     * @param string $word
     * @param int $maxBytes maximum piece length in bytes, must be greater than zero
     * @return string[]
     */
    protected function splitLongWord($word, $maxBytes)
    {
        $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // not valid UTF-8, there are no character boundaries to respect
            return str_split($word, $maxBytes);
        }

        $parts = [];
        $current = '';
        foreach ($chars as $char) {
            if (strlen($char) > $maxBytes) {
                // the limit is smaller than a single character, bytes are all we can do
                return str_split($word, $maxBytes);
            }
            if ($current !== '' && strlen($current) + strlen($char) > $maxBytes) {
                $parts[] = $current;
                $current = '';
            }
            $current .= $char;
        }
        if ($current !== '') {
            $parts[] = $current;
        }

        return $parts;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The oldest sentences are dropped until the space-joined queue is no longer than
     * MAX_OVERLAP_LEN tokens. A single sentence longer than that limit empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        while (count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}