<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most getChunkSize() tokens. For each chunk the embedding vector is
 * fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder|null lazily created, use getTokenEncoder() to access */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var string[] remember sentences when chunking (used as overlap for the next chunk) */
    private $sentenceQueue = [];

    /** @var float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int|string configured chunk size in tokens */
    protected $configChunkSize;

    /** @var int|string configured maximum number of chunks to use as context */
    protected $configContextChunks;

    /** @var float minimum score (0..1) a chunk needs to be considered similar */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration. Reads the keys chunkSize, contextChunks and
     *                      similarityThreshold (the latter given in percent)
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // configured in percent, used as fraction internally
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not positive
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A fraction between 0 and 1 (not percent)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's input limit, 90% of the embedding model's
     * input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns floats and the config value may be a numeric string, so normalize to int
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            (int) $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search indexer. Pages that no longer qualify have their chunks
     * removed, unchanged pages reuse their chunks and changed pages are chunked again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters).
     *                        It is matched against the page ID prefixed with a colon.
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100 and are counted up per chunk
            // NOTE(review): a page yielding more than 100 chunks would reach into the next page's ID range
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // plugins may have added text to the body, it is prepended to the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default was prevented, only use what the plugins provided
            $text = $eventData['body'];
        }

        $chunkID = $firstChunkID;
        foreach ($this->splitIntoChunks($text) as $part) {
            if (trim((string)$part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++; // IDs are only consumed by chunks that were actually created
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks scoring below the similarity threshold are dropped. The number of returned chunks is
     * limited by the configured number of context chunks and, if $limits is set, by the maximum
     * input token length of the chat model.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxChunks = (int) $this->configContextChunks;
        $fetch = $maxChunks;
        if ($limits) {
            // how many full chunks fit into the chat model's input? Needs to be a whole number as it
            // is used as result limit by the storage. The max() guards against a division by zero.
            $fitting = (int) floor($this->chatModel->getMaxInputTokenLength() / max(1, $this->getChunkSize()));
            $fetch = min($fitting, $maxChunks);
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0;
            if ($limits) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[] Empty if the user may not read the page or the page is not in the index
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'User not allowed to read context page {page}',
                    ['page' => $page]
                );
            }
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: page IDs are strings, loose comparison would treat numeric looking
        // IDs like "0123" and "123" as the same page
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'Context page {page} is not in index',
                    ['page' => $page]
                );
            }
            return [];
        }

        // the page's position in the index is its page ID, chunk IDs start at page ID * 100
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0;
            if ($limits) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() returns false for pages in the root namespace. Those have no namespace crumbs,
        // without this check an empty crumb would be created for them
        $ns = (string) getNS($id);
        $namespaces = ($ns === '') ? [] : explode(':', $ns);
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            if ($namespace === '') continue; // ignore empty segments
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check); // the namespace's start page
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * The text is split at sentence boundaries. Each chunk after the first starts with the last
     * sentences of the previous chunk (up to MAX_OVERLAP_LEN tokens) to keep some context.
     * Sentences longer than the chunk size are currently skipped.
     *
     * @param string $text
     * @return string[] The non-empty chunks
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting

        // never carry over overlap sentences from a previously split text
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift()): a falsy sentence like "0" must not end the loop
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue) . $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember sentence for overlap check
            $this->rememberSentence($sentence);
        }

        // add the remainder
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until it holds no more than MAX_OVERLAP_LEN tokens. A single
     * sentence exceeding that limit results in an empty queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}