<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of less than MAX_TOKEN_LEN tokens each. For each chunk the embedding
 * vector is fetched via the OpenAI helper and stored in a K-D Tree, the chunk data itself (text,
 * embedding, meta data) is written to the file system as one JSON file per chunk.
 *
 * Chunk IDs are derived from the page's ID in the DokuWiki index: the first chunk of a page has the
 * ID (page ID * MAX_CHUNKS_PER_PAGE), following chunks are numbered consecutively.
 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';
    /** Size of the chunk ID range reserved for each page */
    const MAX_CHUNKS_PER_PAGE = 100;

    /** @var OpenAI */
    protected $openAI;
    /** @var CLI|null */
    protected $logger;

    /**
     * @param OpenAI $openAI The client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * Without a logger, all progress and error messages are silently dropped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Replaces the existing index file. Chunks of pages that have not been modified since their
     * chunks were written are reused, all other pages are split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // NOTE(review): 1536 is presumably the dimension of the embedding vectors returned
        // by the OpenAI helper - confirm when changing the embedding model
        $itemList = new ItemList(1536);
        foreach ($pages as $pid => $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;

            // each page owns a fixed range of chunk IDs starting here
            $chunkID = $pid * self::MAX_CHUNKS_PER_PAGE;

            $firstChunk = $this->getChunkFilePath($chunkID);
            $pageTime = @filemtime(wikiFN($page));
            $chunkTime = @filemtime($firstChunk); // false when there are no chunks, yet
            if ($chunkTime !== false && $pageTime < $chunkTime) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->reusePageChunks($itemList, $page, $chunkID);
            } else {
                // page is newer than the chunks we have (or has none), create new chunks
                $this->deletePageChunks($chunkID);
                $this->createPageChunks($itemList, $page, $chunkID);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Split the given page, fetch embedding vectors, save chunks and add them to the tree list
     *
     * Chunks for which no embedding could be fetched are logged and skipped. No more than
     * MAX_CHUNKS_PER_PAGE chunks are stored, otherwise the chunk IDs would run into the ID
     * range of the next page.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page to split
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     * @throws \Exception
     */
    protected function createPageChunks(ItemList $itemList, $page, $chunkID)
    {
        $text = rawWiki($page);
        $chunks = $this->splitIntoChunks($text);
        $meta = [
            'pageid' => $page,
        ];
        $endOfRange = $chunkID + self::MAX_CHUNKS_PER_PAGE; // first ID belonging to the next page
        foreach ($chunks as $chunk) {
            if ($chunkID >= $endOfRange) {
                if ($this->logger) {
                    $this->logger->warning(
                        'Page {page} has more than {max} chunks, remaining chunks are skipped',
                        ['page' => $page, 'max' => self::MAX_CHUNKS_PER_PAGE]
                    );
                }
                break;
            }

            try {
                $embedding = $this->openAI->getEmbedding($chunk);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $item = new Item($chunkID, $embedding);
            $itemList->addItem($item);
            $this->saveChunk($item->getId(), $chunk, $embedding, $meta);
            $chunkID++; // only successfully stored chunks consume an ID, keeping IDs consecutive
        }
        if ($this->logger) {
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
        }
    }

    /**
     * Load the existing chunks for the given page and add them to the tree list
     *
     * Chunks are read consecutively starting at the given ID until the first missing one.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page the chunks belong to (used for logging only)
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function reusePageChunks(ItemList $itemList, $page, $chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->loadChunk($chunkID + $i);
            if (!$chunk) break;
            // the item has to carry the ID of this very chunk, not the ID of the page's first
            // chunk, otherwise searches would always resolve to the first chunk's text
            $item = new Item($chunkID + $i, $chunk['embedding']);
            $itemList->addItem($item);
        }
        if ($this->logger) {
            $this->logger->success('{id} reused {count} chunks', ['id' => $page, 'count' => $i]);
        }
    }

    /**
     * Delete all possibly existing chunks for one page (identified by the first chunk ID)
     *
     * Stops at the first chunk file that does not exist.
     *
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function deletePageChunks($chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->getChunkFilePath($chunkID + $i);
            if (!file_exists($chunk)) break;
            unlink($chunk);
        }
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array List of chunk data arrays as returned by loadChunk()
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = $this->getTree();
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed to have some spare results when chunks are filtered out
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // the index may reference chunks whose data file is missing or unreadable
            if (!$chunk) continue;
            // filter out chunks the user is not allowed to read. Chunks that can not be
            // assigned to a page can not be checked and are filtered out, too
            if (
                $auth &&
                (!isset($chunk['meta']['pageid']) || auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ)
            ) {
                continue;
            }
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Access to the KD Tree
     *
     * @return FSKDTree The tree backed by the index file in the storage directory
     */
    public function getTree()
    {
        $file = $this->getStorageDir() . self::INDEX_FILE;
        return new FSKDTree($file, new ItemFactory());
    }

    /**
     * Split the given text into chunks of less than MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences, which are then joined into chunks as long as the token
     * limit allows. Sentences that exceed the limit on their own are dropped with a warning.
     * Empty chunks are never returned, so an empty text results in an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a truthiness based loop would abort at a sentence like "0"
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty if the very first
                // sentence does not fit
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to the file returned by getChunkFilePath().
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param float[] $embedding embedding vector of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $embedding, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'embedding' => $embedding,
            'meta' => $meta,
        ];

        $chunkfile = $this->getChunkFilePath($id);
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|false The chunk data [id, text, embedding, meta => []], false if the chunk
     *                     file does not exist or does not contain a JSON array/object
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getChunkFilePath($id);
        if (!file_exists($chunkfile)) return false;
        $data = json_decode(io_readFile($chunkfile, false), true);
        // json_decode() returns null for broken files, stick to the documented array|false
        return is_array($data) ? $data : false;
    }

    /**
     * Return the path to the chunk file
     *
     * @param int $id The chunk id
     * @return string
     */
    protected function getChunkFilePath($id)
    {
        $id = dechex($id); // use hexadecimal for shorter file names
        return $this->getStorageDir('chunk') . $id . '.json';
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist, the returned path ends with a slash.
     *
     * @param string $subdir Optional sub directory below the storage directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}