<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks whose maximum token length is derived from the configured
 * models (see getChunkSize()). For each chunk the embedding vector is fetched from the
 * embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking, used as overlap for the next chunk */
    private $sentenceQueue = [];

    /**
     * @param ChatInterface $chatModel Model used for answering, provides the context limits
     * @param EmbeddingInterface $embedModel Model used to create embedding vectors
     * @param AbstractStorage $storage Backend the chunks are stored in
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created lazily on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smaller of the embedding token lengths reported by the chat model
     * and the embedding model.
     *
     * @return int
     */
    public function getChunkSize()
    {
        return min(
            $this->chatModel->getMaxEmbeddingTokenLength(),
            $this->embedModel->getMaxEmbeddingTokenLength()
        );
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search indexer. Pages that no longer qualify are removed
     * from the storage, pages older than their stored chunks are reused and everything else
     * is re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Parts for which no embedding could be fetched are logged and left out of the result.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            // the global $ID is pointed at the page before it is rendered
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the chat model's maximum context token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // How many full-size chunks fit into the context budget? The previous formula divided
        // the chunk size by the chat model's embedding length, which is never more than 1 and
        // thus capped the fetch at two chunks regardless of the available context.
        $chunkSize = max(1, (int) $this->getChunkSize()); // guard against division by zero
        $fetch = (int) ceil(
            ($this->chatModel->getMaxContextTokenLength() / $chunkSize)
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most getChunkSize() tokens
     *
     * The text is split into sentences which are then collected into chunks. Each new chunk
     * starts with the most recently seen sentences (up to MAX_OVERLAP_LEN tokens) of the same
     * text to provide some overlap. Sentences longer than the chunk size are skipped with a
     * warning. Empty or whitespace-only chunks are not returned.
     *
     * @param string $text
     * @return string[] list of chunk texts, empty when the text yields no usable chunk
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // loop invariant, look it up only once

        // overlap must never be carried over from a previously split text
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';

        // iterate instead of testing the shifted value for truthiness: a sentence like "0"
        // is falsy and would otherwise silently end the loop, dropping the rest of the text
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $sentence = (string) $sentence;
            if ($sentence === '') continue;

            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                if (trim($chunk) !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue) . $sentence;
                $chunklen = count($tiktok->encode($chunk));
                if ($chunklen > $maxLen) {
                    // overlap and sentence do not fit together, the sentence is more important
                    $chunk = $sentence;
                    $chunklen = $slen;
                }
            }

            // remember every used sentence for the next overlap, including the one that
            // started a new chunk, otherwise the overlap would have a hole
            $this->rememberSentence($sentence);
        }
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined token length no longer exceeds
     * MAX_OVERLAP_LEN. A single sentence longer than that limit leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}