<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface model used to answer chat queries (its token limit caps chunk sizes) */
    protected $chatModel;

    /** @var EmbeddingInterface model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional CLI logger for progress and error output */
    protected $logger;

    /** @var Encoder|null lazily instantiated token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend holding the embedding vectors */
    protected $storage;

    /** @var string[] recently seen sentences, reused as overlap at chunk boundaries */
    private $sentenceQueue = [];

    /** @var int|float the time spent for the last similar chunk retrieval (seconds) */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of context chunks to return */
    protected $configContextChunks;

    /** @var float similarity threshold normalized to the 0..1 range */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration; must contain the keys
     *                      'chunkSize', 'contextChunks' and 'similarityThreshold' (percent, 0-100)
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // config value is a percentage (0-100), internally a fraction (0..1) is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException if $max is not positive
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold value between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException if the threshold is out of range
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * Created lazily on first use and cached afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The configured size is capped by both the chat model's and the embedding model's
     * input token limits.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
        if (!$tokenlimit) {
            // no token limit, use the configured chunk size
            return $this->configChunkSize;
        }

        return (int)min(
            floor($tokenlimit / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Unchanged pages (older than their stored chunks) are reused; changed pages are
     * re-chunked and re-embedded. Pages excluded by the filters are removed from the index.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // NOTE(review): $skipRE is matched against the bare page id while $matchRE is
            // matched against ":$page" (leading colon) — verify this asymmetry is intended
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            // rendering failed, fall back to the raw wiki source
            if ($this->logger instanceof CLI) $this->logger->error(
                'Failed to render page {page}. Using raw text instead. {msg}',
                ['page' => $page, 'msg' => $e->getMessage()]
            );
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // a plugin prevented default handling, use only the plugin-provided body
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // skip this chunk but keep indexing the rest of the page
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        if ($tokenlimit) {
            // fetch no more chunks than could ever fit into the token limit
            $fetch = min(
                ($tokenlimit / $this->getChunkSize()),
                $this->configContextChunks
            );
        } else {
            $fetch = $this->configContextChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0; // only counted when a token limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'User not allowed to read context page {page}', ['page' => $page]
            );
            return [];
        }

        // the page's position in the indexer list determines its chunk ID range
        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $pos = array_search(cleanID($page), $pages);

        if ($pos === false) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'Context page {page} is not in index', ['page' => $page]
            );
            return [];
        }

        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0; // only counted when a token limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }


    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        $namespaces = explode(':', getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces: resolve each prefix to its start page and use that page's title
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * Splitting happens at sentence boundaries; up to MAX_OVERLAP_LEN tokens of trailing
     * sentences are repeated at the start of the next chunk as overlap.
     *
     * @param string $text
     * @return array
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        // hoisted out of the loop: getChunkSize() consults both models on every call
        $chunkSize = $this->getChunkSize();

        // start fresh — otherwise overlap sentences remembered from a previously
        // chunked text would leak into the first chunks of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence like "0" must not end the loop early
        // (array_shift() returns null only when the array is empty)
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $chunkSize) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                // NOTE(review): the sentence that starts the new chunk is not added to
                // the overlap queue here — verify this asymmetry is intended
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        // append the trailing chunk, trimmed and empty-checked like all others
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}