<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\Extension\PluginInterface;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to getChunkSize() tokens each. For each chunk the embedding
 * vector is fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface chat model, used to size chunks against its input token limit */
    protected $chatModel;

    /** @var EmbeddingInterface model that turns text into embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger for progress and error output */
    protected $logger;

    /** @var Encoder lazily created token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the vector storage backend */
    protected $storage;

    /** @var string[] recently added sentences, kept for overlap between consecutive chunks */
    private $sentenceQueue = [];

    /** @var int|float the time spent for the last similar chunk retrieval (seconds) */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks to pass as chat context */
    protected $configContextChunks;

    /** @var float minimum similarity score (0-1) a chunk needs to be used as context */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the config value is a percentage, internally a 0-1 fraction is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created lazily on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's input window, 90% of the embedding
     * model's input window, and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // cast to int: floor() returns float and min() would propagate it
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages; pages that are gone, hidden, tiny or filtered out by the
     * regexes have their chunks deleted. Unchanged pages get their existing chunks reused,
     * changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. A breadcrumb trail is prepended to every
     * chunk to give the model some context. Chunks whose embedding fetch fails are
     * skipped (logged, not fatal).
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger) $this->logger->error(
                'Failed to render page {page}. Using raw text instead. {msg}',
                ['page' => $page, 'msg' => $e->getMessage()]
            );
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // a plugin prevented the default action, use only the plugin-provided body
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the chat model's input limit and the
     * configured context chunk count; chunks below the similarity threshold are dropped.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $fetch = min(
            ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        $namespaces = explode(':', getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * Sentence boundaries are preserved; consecutive chunks overlap by up to
     * MAX_OVERLAP_LEN tokens of trailing sentences to keep context intact.
     * Sentences longer than a whole chunk are currently skipped (with a warning).
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxChunkLen = $this->getChunkSize(); // loop invariant, compute once

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: array_shift() returns NULL only on an empty array, while a
        // bare truthiness test would end the loop early on a falsy sentence such as "0"
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        // don't lose the last, unfinished chunk (same trim-and-skip as inside the loop)
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue holds the trailing sentences used as overlap when a new chunk is started;
     * it is trimmed from the front so its token count stays at or below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}