<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\backend\AbstractStorage;
use dokuwiki\plugin\aichat\backend\Chunk;
use dokuwiki\plugin\aichat\backend\SQLiteStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_CHUNK_LEN tokens each. For each chunk the embedding
 * vector is fetched from OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum length (in tokens) of all context chunks together */
    const MAX_CONTEXT_LEN = 3800;

    /** @var int maximum size of a single chunk in tokens */
    const MAX_CHUNK_LEN = 1000;

    /** @var int maximum overlap between two consecutive chunks in tokens */
    const MAX_OVERLAP_LEN = 200;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;

    /** @var Encoder|null token encoder, created lazily via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend that persists the chunks and their vectors */
    protected $storage;

    /** @var string[] sliding window of recent sentences, used to overlap consecutive chunks */
    private $sentenceQueue = [];

    /**
     * @param OpenAI $openAI Client used to request embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access the storage backend
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * When no logger is set, all progress/error reporting is silently skipped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if ($this->tokenEncoder === null) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify (missing, hidden, tiny or
     * matching $skipRE) have their chunks removed. Unchanged pages reuse their existing chunks;
     * changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the vector dimension — presumably OpenAI's ada-002 embedding size; confirm
        // against the OpenAI client if the model ever changes
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page index * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            // NOTE(review): @filemtime returns false on failure, which compares as 0 here and
            // would take the "reuse" branch for an unreadable file — acceptable best-effort
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks for which the embedding request
     * fails are logged and skipped rather than aborting the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            $ID = $page; // the renderer resolves relative references against the current page
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $chunkID = $firstChunkID; // running ID, incremented per stored chunk
        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                // skip this chunk but keep processing the rest of the page
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // fetch a few more than needed, since not all chunks are maximum length
        $fetch = (int)ceil((self::MAX_CONTEXT_LEN / self::MAX_CHUNK_LEN) * 1.2);
        $chunks = $this->storage->getSimilarChunks($vector, $fetch);

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > self::MAX_CONTEXT_LEN) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_CHUNK_LEN tokens
     *
     * Splitting happens on sentence boundaries; consecutive chunks overlap by up to
     * MAX_OVERLAP_LEN tokens of trailing sentences so context is not lost at the seams.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        // start with a clean overlap window so sentences from a previous call cannot bleed over
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // strict null check: array_shift() returns NULL only when the array is empty; a plain
        // truthiness check would end the loop early on a falsy sentence like "0"
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_CHUNK_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_CHUNK_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = join(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so that its joined token count stays
     * at or below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(join(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}