<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of roughly the configured chunk size (in tokens). For each chunk
 * the embedding vector is fetched from the embedding model and stored in the storage backend.
 */
class Embeddings
{
    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;
    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var int the time spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    protected $configChunkSize;
    protected $configContextChunks;
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold
     * @return void
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * @return int
     */
    public function getChunkSize()
    {
        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
        if (!$tokenlimit) {
            // no token limit, use the configured chunk size
            return $this->configChunkSize;
        }

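        // Example with hypothetical numbers: a chat model limit of 8192 tokens and an embedding
        // model limit of 8191 tokens give min(floor(8192 / 4), floor(8191 * 0.9), chunkSize)
        // = min(2048, 7371, chunkSize), so a configured chunk size of 1500 would be used as-is.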
        return min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * the raw wiki syntax if the renderer fails.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger) $this->logger->error(
                'Failed to render page {page}. Using raw text instead. {msg}',
                ['page' => $page, 'msg' => $e->getMessage()]
            );
            $text = rawWiki($page);
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Uses getPageContent() to read the page in an LLM friendly format and prefixes each
     * chunk with a breadcrumb trail for context.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            $text = $eventData['body'] . ' ' . $text;
        } else {
            $text = $eventData['body'];
        }

        $splitter = new TextSplitter($this->getChunkSize(), $this->getTokenEncoder());
        $parts = $splitter->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the configured context chunk count and,
     * if limits are applied, on the chat model's input token limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

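        // Ask the storage backend for enough chunks to potentially fill the token budget, but never
        // more than the configured maximum. Hypothetical example: with an 8192 token limit and
        // 1500 token chunks this requests at most 8192 / 1500 ≈ 5.5 chunks (capped by configContextChunks).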
        if ($tokenlimit) {
            $fetch = min(
                ($tokenlimit / $this->getChunkSize()),
                $this->configContextChunks
            );
        } else {
            $fetch = $this->configContextChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize ?? 0;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similarly to getSimilarChunks(), but returns the full page content for each similar chunk found
     *
     * This will not apply any token limits
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($chunk->getPage());
            $crumbs = $this->breadcrumbTrail($chunk->getPage());

            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'User not allowed to read context page {page}', ['page' => $page]
            );
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $pos = array_search(cleanID($page), $pages);

        if ($pos === false) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'Context page {page} is not in index', ['page' => $page]
            );
            return [];
        }

        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize ?? 0;
        }

        return $result;
    }


    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        $namespaces = explode(':', getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

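        // Hypothetical example: for a page id "wiki:syntax:tables" whose namespaces and page have
        // the first headings "Wiki", "Syntax" and "Tables", the resulting trail would read
        // "Wiki (wiki) » Syntax (syntax) » Tables (tables)".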
        return implode(' » ', $crumbs);
    }
}