<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model (used for token budget calculations) */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger for progress/error reporting */
    protected $logger;

    /** @var Encoder lazily created token encoder, see getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend the chunks and vectors are stored in */
    protected $storage;

    /** @var array remember sentences when chunking, used to create overlapping chunks */
    private $sentenceQueue = [];

    /** @var int the time spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks to return as context */
    protected $configContextChunks;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs keys 'chunkSize' and 'contextChunks'
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's input window, 90% of the embedding
     * model's input window, and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // cast: floor() returns float but this is documented (and used) as an int
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify (missing, hidden, tiny,
     * filtered by the regexes) have their chunks deleted; unchanged pages have their existing
     * chunks reused; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                // NOTE(review): $skipRE is matched against the bare page id while $matchRE is
                // matched against ":$page" — confirm this asymmetry is intended
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding request fails are
     * skipped (the error is logged) rather than aborting the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page; // the text renderer expects the global $ID to be set
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks are added until the chat model's input token budget would be exceeded.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // cast: the division may yield a float, but this is a chunk count
        $fetch = (int)min(
            $this->chatModel->getMaxInputTokenLength() / $this->getChunkSize(),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = microtime(true) - $time;
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round($this->timeSpent, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into sentence-aligned chunks of at most getChunkSize() tokens
     *
     * Consecutive chunks overlap by up to MAX_OVERLAP_LEN tokens (see rememberSentence()).
     * Sentences longer than the chunk size are skipped with a warning.
     *
     * @param string $text
     * @return array
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxTokens = $this->getChunkSize(); // loop invariant, avoid recomputing per sentence

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence like '0' must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxTokens) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxTokens) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue holds the most recent sentences up to MAX_OVERLAP_LEN tokens and is used
     * to start a new chunk with some overlap from the previous one.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}