<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most getChunkSize() tokens. For each chunk the embedding vector is
 * fetched from the embedding model and stored in the storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var int the configured chunk size in tokens (taken from $config['chunkSize']) */
    protected $configChunkSize;

    /** @var int the configured number of context chunks (taken from $config['contextChunks']) */
    protected $configContextChunks;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, the keys 'chunkSize' and 'contextChunks' are read
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of a quarter of the chat model's input limit, 90% of the embedding model's
     * input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns floats, cast the result so the documented integer is really returned
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search indexer. Pages that no longer qualify have their
     * chunks deleted, unchanged pages have their chunks reused and all other pages are chunked again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters),
     *                       matched against the page name
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters),
     *                        matched against the page name prefixed with a colon
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): a page yielding more than 100 chunks would reach into the next page's ID range
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) {
                    $this->logger->info("Reusing chunks for $page");
                }
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Parts for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // the global page ID is set to the rendered page for the time of rendering only
            $previousID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $previousID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') {
                continue; // skip empty chunks
            }

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the chat model's maximum input token length and
     * by the configured number of context chunks.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxTokens = $this->chatModel->getMaxInputTokenLength();
        $maxChunks = (int) $this->configContextChunks;

        $fetch = (int) ceil(
            min(
                ($maxTokens / $this->getChunkSize()),
                $this->configContextChunks
            )
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) {
                continue;
            }

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $maxTokens) {
                break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            // more chunks than configured were fetched on purpose, but never return more than configured
            if ($maxChunks > 0 && count($result) >= $maxChunks) {
                break;
            }
        }
        return $result;
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are collected until the chunk size returned by getChunkSize() would be reached. Each
     * following chunk starts with the most recent sentences of the text before it (up to
     * MAX_OVERLAP_LEN tokens) to provide some overlap. Sentences longer than the chunk size are skipped.
     *
     * The last chunk is always added, so the result may contain an empty string.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxChunkLen = $this->getChunkSize(); // does not change while splitting

        // start with an empty overlap queue, sentences of a previously split text must not leak into this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split((string) $text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a falsy sentence like "0" must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            if ($sentence === '') {
                continue; // nothing to add
            }

            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning('Sentence too long, splitting not implemented yet');
                }
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));

                // the sentence is part of the text as well, keep it so the next overlap has no gap
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The oldest sentences are dropped again until the queue is at most MAX_OVERLAP_LEN tokens long.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}