<?php

namespace dokuwiki\plugin\aichat;

use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Class to split text into chunks of a given size in tokens
 *
 * Prefers to split at sentence boundaries, but will split long sentences if necessary.
 * Also keeps some overlap between chunks to preserve context.
 */
class TextSplitter
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var int maximum chunk size in tokens */
    protected int $chunkSize;

    /** @var Encoder token encoder used to measure text length in tokens */
    protected Encoder $tiktok;

    /** @var string[] most recently used sentences, prepended to the next chunk as overlap */
    protected array $sentenceQueue = [];

    /**
     * Constructor
     *
     * @param int $chunksize maximum chunk size in tokens
     * @param Encoder $tiktok token encoder
     */
    public function __construct(int $chunksize, Encoder $tiktok)
    {
        $this->chunkSize = $chunksize;
        $this->tiktok = $tiktok;
    }

    /**
     * Split the given text into chunks of the configured size
     *
     * The text is split into sentences first. Sentences are appended to the current chunk
     * (exactly as returned by the sentence splitter) until the chunk would reach the
     * configured size. A new chunk is then started, prefixed with the remembered sentences
     * (at most MAX_OVERLAP_LEN tokens) to carry over context. Because of that prefix a chunk
     * may be longer than the configured size.
     *
     * @param string $text
     * @return string[] the trimmed, non-empty chunks in document order
     */
    public function splitIntoChunks(string $text): array
    {
        $this->sentenceQueue = []; // reset sentence queue
        $chunks = [];

        $sentenceSplitter = new Sentence();
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a sentence like "0" or "" is falsy and must not
        // end the loop early, which would silently drop the rest of the text
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') {
                continue; // nothing to add
            }

            $slen = count($this->tiktok->encode($sentence));
            if ($slen > $this->chunkSize) {
                // Sentence is too long, split into smaller parts and push the results back to the front of the queue
                $parts = $this->splitLongSentence($sentence);
                if ($parts !== [$sentence]) {
                    array_unshift($sentences, ...$parts);
                    continue;
                }
                // The sentence could not be split any further. Re-queueing it would loop
                // forever, so it is accepted as is even though it is too long.
            }

            if ($chunklen + $slen < $this->chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($this->tiktok->encode($chunk));

                // The sentence is part of the new chunk, so it has to become part of the
                // overlap as well. This has to happen after the chunk was built, otherwise
                // the sentence would be contained twice.
                $this->rememberSentence($sentence);
            }
        }

        // Add the last chunk if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Force splitting of a too long sentence into smaller parts
     *
     * The sentence is split at whitespace. The whitespace itself is kept, so concatenating
     * the returned parts yields the original sentence again. Words are collected into parts
     * of about a quarter of the chunk size. Words that are longer than that on their own are
     * split by characters.
     *
     * @param string $sentence
     * @return string[] the non-empty parts in their original order
     */
    protected function splitLongSentence($sentence)
    {
        // when force splitting, make sentences a quarter of the chunk size
        $limit = $this->chunkSize / 4;
        // the character based fallback needs an integer length of at least 1
        $maxChars = max(1, (int) $limit);

        // Try naive approach first: split by spaces
        $words = preg_split('/(\s+)/', $sentence, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($words === false) {
            $words = [$sentence];
        }

        $subSentences = [];
        $currentSubSentence = '';
        $currentSubSentenceLen = 0;

        foreach ($words as $word) {
            if ($word === '') {
                continue;
            }
            $wordLen = count($this->tiktok->encode($word));

            if ($wordLen > $limit) {
                // Flush what was collected so far first, otherwise the text collected
                // before the long word would end up behind it
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                    $currentSubSentence = '';
                    $currentSubSentenceLen = 0;
                }

                // If a single word is too long, split it into smaller chunks
                // FIXME this splitting should be done by tokens, not by characters
                array_push($subSentences, ...$this->splitLongWord($word, $maxChars));
            } elseif ($currentSubSentenceLen + $wordLen < $limit) {
                // Add to current sub-sentence
                $currentSubSentence .= $word;
                $currentSubSentenceLen += $wordLen;
            } else {
                // Add current sub-sentence to result
                if ($currentSubSentence !== '') {
                    $subSentences[] = $currentSubSentence;
                }
                // Start new sub-sentence
                $currentSubSentence = $word;
                $currentSubSentenceLen = $wordLen;
            }
        }

        // Add last sub-sentence to result
        if ($currentSubSentence !== '') {
            $subSentences[] = $currentSubSentence;
        }

        return $subSentences;
    }

    /**
     * Split a single word into pieces of at most the given number of characters
     *
     * Works on UTF-8 characters so multi-byte characters are never cut in half. Falls back
     * to splitting by bytes when the word is not valid UTF-8.
     *
     * @param string $word
     * @param int $maxChars maximum number of characters per piece, at least 1
     * @return string[]
     */
    private function splitLongWord(string $word, int $maxChars): array
    {
        $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // the u modifier rejects invalid UTF-8
            return str_split($word, $maxChars);
        }

        return array_map(
            static fn(array $piece): string => implode('', $piece),
            array_chunk($chars, $maxChars)
        );
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is used as overlap at the start of the next chunk. The oldest sentences are
     * dropped until the queue, joined by spaces, is no longer than MAX_OVERLAP_LEN tokens.
     * A single sentence longer than that empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        while (count($this->tiktok->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}