<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to getChunkSize() tokens each. For each chunk the embedding
 * vector is fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model, used here to size chunks/context to its input limit */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional CLI logger for progress and error output */
    protected $logger;

    /** @var Encoder lazily initialized, always access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the vector storage backend */
    protected $storage;

    /** @var string[] recently added sentences, kept to create overlaps between chunks */
    private $sentenceQueue = [];

    /** @var int|float the time spent for the last similar chunk retrieval, in seconds */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks to use as context */
    protected $configContextChunks;

    /** @var float minimum similarity score (0-1) a chunk needs to be used as context */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // configuration value is a percentage, convert to a 0-1 fraction
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's input limit, 90% of the embedding
     * model's input limit, and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns float in PHP, cast so the documented int return type holds
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that should not be in the index are removed,
     * unchanged pages keep their existing chunks, changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                // match both REs against the absolute page id (leading colon) for consistency
                ($skipRE && preg_match($skipRE, ":$page")) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // a plugin prevented the default indexing, use only what it provided
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // log and skip this chunk, keep indexing the rest of the page
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the chat model's input limit and the
     * configured maximum number of context chunks.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // never fetch more chunks than fit into the chat model's input
        $fetch = (int) min(
            ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * Consecutive chunks overlap by up to MAX_OVERLAP_LEN tokens (see rememberSentence()).
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        // start with a fresh overlap queue, chunks must not overlap across calls (pages)
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a sentence like "0" is falsy but still valid
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->getChunkSize()) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning('Sentence too long, splitting not implemented yet');
                }
                continue;
            }

            if ($chunklen + $slen < $this->getChunkSize()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence starting the new chunk has to be part of the next overlap, too
                $this->rememberSentence($sentence);
            }
        }
        // add the last chunk, if not empty
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so it never exceeds MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}