<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\AbstractModel;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel the model used to create embedding vectors */
    protected $model;

    /** @var CLI|null optional logger, set via setLogger() */
    protected $logger;

    /** @var Encoder token encoder, lazily created — access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend the chunk vectors are persisted in */
    protected $storage;

    /** @var array remember sentences when chunking, used as overlap between adjacent chunks */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model Model used to fetch embedding vectors
     * @param AbstractStorage $storage Storage backend the chunks are written to
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the fulltext indexer. Pages that no longer exist, are
     * hidden, are tiny, or fail the skip/match filters have their chunks removed. Unchanged
     * pages (file older than the stored first chunk) get their existing chunks reused, all
     * others are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // NOTE(review): $skipRE is matched against the bare page id while $matchRE is
            // matched against ":$page" — confirm this asymmetry is intended
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) {
                    $this->logger->info("Reusing chunks for $page");
                }
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks whose embedding request fails are logged and skipped, they do not abort
     * the processing of the remaining chunks.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            // the text renderer needs the global $ID to resolve relative references
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the next chunk
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        $fetch = ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks that fit the model's embedding token limit
     *
     * Adjacent chunks overlap by up to MAX_OVERLAP_LEN tokens worth of trailing sentences
     * of the previous chunk.
     *
     * @param string $text
     * @return array list of chunk strings
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        // start with a clean overlap queue — the queue is an instance property, without this
        // reset sentences remembered from a previously split text would leak into this one
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a falsy sentence such as "0" would otherwise
        // terminate the loop early and silently drop the rest of the text
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                // NOTE(review): the sentence starting the new chunk is not added to the
                // overlap queue via rememberSentence() — confirm this is intended
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        // don't emit an empty trailing chunk (e.g. for empty input)
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its joined token count stays at or
     * below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}