<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to getChunkSize() tokens each. For each chunk the embedding
 * vector is fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface model used for chat completion, defines the context token budget */
    protected $chatModel;

    /** @var EmbeddingInterface model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;

    /** @var Encoder lazily initialized token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage vector storage backend */
    protected $storage;

    /** @var string[] recently seen sentences, reused as overlap at the start of the next chunk */
    private $sentenceQueue = [];

    /** @var float the time spent (seconds) for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;
    /** @var int configured maximum number of context chunks to pass to the chat model */
    protected $configContextChunks;
    /** @var float minimum similarity score (0-1) a chunk needs to be used as context */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // config value is a percentage, internally a 0-1 fraction is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException if $max is not positive
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold a fraction between 0 and 1
     * @return void
     * @throws \InvalidArgumentException if $threshold is out of range
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The configured chunk size is capped by the chat and embedding model token limits.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
        if (!$tokenlimit) {
            // no token limit, use the configured chunk size
            return $this->configChunkSize;
        }

        return min(
            floor($tokenlimit / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs are derived from the page's position in the indexer page list,
            // this matches the calculation in getPageChunks()
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                // both filters test the colon-prefixed absolute page ID for consistency
                ($skipRE && preg_match($skipRE, ":$page")) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger) $this->logger->error(
                'Failed to render page {page}. Using raw text instead. {msg}',
                ['page' => $page, 'msg' => $e->getMessage()]
            );
            $text = rawWiki($page);
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // a plugin prevented the default action, use only the plugin-provided body
            $text = $eventData['body'];
        }
        // complete the event life cycle so listening plugins get notified
        $event->advise_after();

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // skip this chunk but keep indexing the rest of the page
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the configured context chunks and, when
     * $limits is true, the chat model's input token limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        if ($tokenlimit) {
            // fetch no more chunks than can possibly fit into the token budget
            $fetch = (int)min(
                floor($tokenlimit / $this->getChunkSize()),
                $this->configContextChunks
            );
        } else {
            $fetch = $this->configContextChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0;
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($chunk->getPage());
            $crumbs = $this->breadcrumbTrail($chunk->getPage());

            // reuse the chunk's meta data but replace the text with the full page content
            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'User not allowed to read context page {page}', ['page' => $page]
            );
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // chunk IDs are derived from the page's position in the indexer page list,
        // this matches the calculation in createNewIndex()
        $pos = array_search(cleanID($page), $pages);

        if ($pos === false) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'Context page {page} is not in index', ['page' => $page]
            );
            return [];
        }

        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0;
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() returns false for pages in the root namespace, avoid a spurious empty crumb
        $ns = getNS($id);
        $namespaces = $ns ? explode(':', $ns) : [];
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of roughly getChunkSize() tokens on sentence boundaries
     *
     * Each new chunk starts with up to MAX_OVERLAP_LEN tokens of sentences from the end of the
     * previous chunk to preserve context across chunk borders.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        // start with a clean overlap queue, otherwise sentences from a previously
        // chunked page would leak into the first chunk of this text
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);
        $maxChunkLen = $this->getChunkSize(); // loop invariant, calculate only once

        $chunklen = 0;
        $chunk = '';
        // strict null check so a falsy sentence like "0" does not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        // add the last (possibly partial) chunk, same trimming as the chunks above
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}