<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Utf8\PhpString;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Class to split text into chunks of a given size in tokens
 *
 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
 * Also keeps some overlap between chunks to preserve context.
 */
class TextSplitter
{
    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;
    /** @var Encoder token encoder used to measure text length */
    protected Encoder $tiktok;
    /** @var string[] most recently added sentences, reused as overlap at the start of the next chunk */
    protected array $sentenceQueue = [];
    /** @var int desired overlap between chunks in tokens */
    protected int $overlap;

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     * @param int $overlap desired overlap between chunks in tokens
     */
    public function __construct(int $chunksize, Encoder $tiktok, $overlap = 200)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
        $this->overlap = $overlap;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * Sentences are appended to the current chunk while the running token count stays below
     * the chunk size. When a sentence no longer fits, the current chunk is emitted and a new
     * one is started from the remembered overlap sentences followed by that sentence.
     *
     * @param string $text
     * @return string[] trimmed, non-empty chunks
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // Compare against null explicitly: a falsy sentence such as '0' must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') {
                continue; // nothing to add
            }

            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                if ($parts !== [$sentence]) {
                    array_unshift($sentences, ...$parts);
                    continue;
                }
                // The sentence could not be broken down any further. Accept it as is,
                // re-queueing it would loop forever.
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                // Remember only after the overlap prefix has been built, otherwise the sentence
                // would be added twice. Without this the sentence would be missing from the
                // overlap of the following chunk.
                $this->rememberSentence($sentence);
                $chunklen = count($this->tiktok->encode($chunk));
            }
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts, preferably at word boundaries
     *
     * The parts are returned in text order and never contain empty strings. Concatenating
     * them yields the original sentence again.
     *
     * @param string $sentence
     * @return string[]
     */
    protected function splitLongSentence(string $sentence): array
    {
        // When force splitting, make sentences a quarter of the chunk size. Integer division keeps
        // the value usable as an int argument, the lower bound of 1 avoids a division by zero later.
        $chunkSize = max(1, intdiv($this->chunkSize, 4));

        // Try naive approach first: split at word boundaries (unicode aware)
        $words = preg_split('/\b/u', $sentence);
        if ($words === false) {
            // most likely invalid UTF-8, retry in byte mode
            $words = preg_split('/\b/', $sentence);
        }
        if ($words === false) {
            $words = [$sentence];
        }

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') {
                continue; // preg_split returns empty strings at the edges
            }
            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $chunkSize) {
                // word is too long, probably no spaces, split it further
                // flush what was collected so far first to keep the text in order
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                $currentSubSentence = '';
                $currentSubSentenceLen = 0;
                // array_merge returns the merged array, it does not modify its arguments
                $subSentences = array_merge($subSentences, $this->splitString($word, $wordLen, $chunkSize));
            } elseif ($currentSubSentenceLen + $wordLen < $chunkSize) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') {
            $subSentences[] = $currentSubSentence;
        }

        return $subSentences;
    }

    /**
     * Split a string into smaller parts of approximately the given size
     * This is a naive split that does not care about word boundaries
     *
     * The number of pieces is derived from the token counts, the cut positions are then
     * spread evenly over the characters of the text. No empty pieces are returned.
     *
     * @param string $text text to split
     * @param int $tokenlength length of the text in tokens
     * @param int $chunksize desired chunk size in tokens
     * @return string[]
     */
    protected function splitString(string $text, int $tokenlength, int $chunksize): array
    {
        $textLength = (int) PhpString::strlen($text);
        if ($textLength < 1) {
            return [];
        }

        $numPieces = (int) ceil($tokenlength / max(1, $chunksize));
        // never create more pieces than there are characters
        $numPieces = max(1, min($numPieces, $textLength));
        $pieceLength = (int) ceil($textLength / $numPieces);

        // utf8 aware split
        $pieces = [];
        for ($offset = 0; $offset < $textLength; $offset += $pieceLength) {
            $pieces[] = PhpString::substr($text, $offset, $pieceLength);
        }
        return $pieces;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The oldest sentences are dropped again until the space-joined queue is no longer
     * than the configured overlap. A sentence that exceeds the overlap on its own leaves
     * the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        // stop on an empty queue, otherwise a negative overlap would loop forever
        while (
            $this->sentenceQueue !== [] &&
            count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > $this->overlap
        ) {
            array_shift($this->sentenceQueue);
        }
    }
}