<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\AbstractModel;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to the model's maximum embedding token length. For each
 * chunk the embedding vector is fetched from the model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel model used to create embedding vectors */
    protected $model;
    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;
    /** @var Encoder token encoder, lazily created via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend that persists the chunk vectors */
    protected $storage;

    /** @var string[] recently added sentences, reused as overlap when a new chunk starts */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model Embedding provider
     * @param AbstractStorage $storage Vector storage backend
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that should not be indexed (missing, hidden,
     * tiny, or matching $skipRE) have their chunks removed. Unchanged pages reuse their
     * existing chunks; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) {
                    $this->logger->info("Reusing chunks for $page");
                }
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding fails are
     * skipped (logged) rather than aborting the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            // the text renderer needs the global $ID set to resolve relative references
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $chunkID = $firstChunkID; // running ID, incremented per stored chunk
        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        // fetch a few more than strictly fit into the context window, since
        // not all chunks are maximum length and some may be filtered by ACL
        $fetch = ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $maxContext = $this->model->getMaxContextTokenLength();
        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $maxContext) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks that fit the model's embedding token limit
     *
     * Splitting is done on sentence boundaries. Consecutive chunks share up to
     * MAX_OVERLAP_LEN tokens of trailing sentences as overlap for better context.
     *
     * @param string $text
     * @return string[] the text chunks
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxlen = $this->model->getMaxEmbeddingTokenLength(); // loop invariant

        // start with a clean overlap queue — otherwise sentences remembered
        // from a previously split text would bleed into the first chunk here
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // note: a plain foreach (instead of while+array_shift) also processes
        // falsy sentences like the string "0" instead of stopping early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxlen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning('Sentence too long, splitting not implemented yet');
                }
                continue;
            }

            if ($chunklen + $slen < $maxlen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // current chunk is full, add it to the result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences as overlap
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence opening this chunk must be part of the next
                // overlap as well, same as in the branch above
                $this->rememberSentence($sentence);
            }
        }
        // flush the last (possibly partial) chunk, but don't emit an empty one
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its combined token count stays
     * at or below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}