<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface model used to size the chat context window */
    protected $chatModel;

    /** @var EmbeddingInterface model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger for progress and error output */
    protected $logger;

    /** @var Encoder lazily initialized token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend the chunks and vectors are persisted in */
    protected $storage;

    /** @var array remember sentences when chunking, used to build overlapping chunks */
    private $sentenceQueue = [];

    /** @var int the time spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int maximum number of tokens per chunk (from plugin config) */
    protected $configChunkSize;

    /** @var int maximum number of chunks to use as chat context (from plugin config) */
    protected $configContextChunks;

    /** @var float minimum similarity score (0-1) a chunk needs to be used as context */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // config value is a percentage, convert to a 0-1 fraction
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and cached afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's input window, 90% of the embedding
     * model's input window, and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // cast to int: floor() returns a float, but callers and the docblock expect an int
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that are missing, hidden, tiny or filtered by the
     * given expressions are removed from the index. Unchanged pages are reused, changed pages
     * are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // NOTE(review): $skipRE is matched against the bare page id while $matchRE is
            // matched against ":$page" — confirm this asymmetry is intentional
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding request fails
     * are skipped (the error is logged) instead of aborting the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page; // the text renderer expects the global $ID to be set
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks below the configured similarity threshold are dropped, and the result is cut
     * off once it would exceed the chat model's input token limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // cast to int: the division may yield a float, but we need a whole chunk count
        $fetch = min(
            (int)($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most getChunkSize() tokens
     *
     * Chunks overlap by up to MAX_OVERLAP_LEN tokens: each new chunk starts with the
     * sentences remembered from the end of the previous one.
     *
     * @param string $text
     * @return array
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        // start with a clean overlap queue, otherwise sentences remembered from a
        // previous call (e.g. the previously indexed page) would leak into this text
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence like '0' must not end the loop early
        // (array_shift() returns null once the array is empty)
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->getChunkSize()) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning('Sentence too long, splitting not implemented yet');
                }
                continue;
            }

            if ($chunklen + $slen < $this->getChunkSize()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // current chunk is full, add it to the result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences for overlap
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // also remember this sentence, so the next chunk's overlap includes it
                $this->rememberSentence($sentence);
            }
        }
        // don't emit a trailing empty chunk when the text ended exactly on a boundary
        if (trim($chunk) !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its total token count stays
     * at or below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}