<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\Model\AbstractModel;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to the model's maximum embedding token length.
 * For each chunk the embedding vector is fetched from the model and stored in the
 * Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    const MAX_OVERLAP_LEN = 200;

    /** @var int pages smaller than this many bytes are not indexed */
    const MIN_PAGE_SIZE = 150;

    /** @var AbstractModel the model used to create embedding vectors */
    protected $model;
    /** @var CLI|null optional logger for progress/error reporting */
    protected $logger;
    /** @var Encoder lazily created token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend holding the chunk vectors */
    protected $storage;

    /** @var string[] recently added sentences, reused as overlap between chunks */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model Model used to fetch embedding vectors
     * @param AbstractStorage $storage Backend to persist the chunks in
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if ($this->tokenEncoder === null) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Walks all indexed pages; pages that should not be indexed (missing, hidden,
     * tiny, or matching $skipRE) have their chunks removed. Unchanged pages reuse
     * their existing chunks, changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < self::MIN_PAGE_SIZE || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which the embedding request fails are logged and skipped, they
     * do not abort the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            $ID = $page; // the renderer needs the current page ID set
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query)
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        $fetch = (int)ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );
        $chunks = $this->storage->getSimilarChunks($vector, $fetch);

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of at most the model's maximum embedding
     * token length, overlapping consecutive chunks by up to MAX_OVERLAP_LEN tokens
     * worth of trailing sentences.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxlen = $this->model->getMaxEmbeddingTokenLength();

        // start with a clean overlap queue, otherwise sentences remembered from a
        // previously processed text would leak into the first chunk of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence like "0" is still valid content,
        // array_shift() returns null only once the array is empty
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxlen) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < $maxlen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // current chunk is full, add it to the result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences as overlap
                $chunk = join(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence starting this chunk has to be eligible for the
                // next overlap as well
                $this->rememberSentence($sentence);
            }
        }
        // flush the last chunk, but don't emit an empty one when no sentence was usable
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its total token count stays at or
     * below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(join(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}