<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_TOKEN_LEN tokens each. For each chunk the embedding vector
 * is fetched via the OpenAI helper and stored in a K-D Tree, chunk data is written to the file system.
 *
 * Chunk IDs are derived from the page ID of the search index: the chunks of a page occupy the ID range
 * [pageID * MAX_CHUNKS_PER_PAGE, (pageID + 1) * MAX_CHUNKS_PER_PAGE).
 */
class Embeddings
{
    /** Maximum number of tokens in a single chunk */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory (below the index directory) that holds all data of this class */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D Tree */
    const INDEX_FILE = 'index.bin';
    /** Size of the chunk ID range reserved for each page */
    const MAX_CHUNKS_PER_PAGE = 100;

    /** @var OpenAI */
    protected $openAI;
    /** @var CLI|null */
    protected $logger;

    /**
     * @param OpenAI $openAI The client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * When no logger is set, all progress and error messages are silently dropped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index. Chunks of pages that have not been modified since their chunks were
     * written are reused, all other pages are split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // NOTE(review): 1536 is presumably the dimension of the embedding vectors - confirm against OpenAI helper
        $itemList = new ItemList(1536);
        foreach ($pages as $pid => $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;

            // each page owns a fixed range of chunk IDs starting here
            $chunkID = $pid * self::MAX_CHUNKS_PER_PAGE;

            $firstChunk = $this->getChunkFilePath($chunkID);
            // a missing chunk file yields false, which makes the comparison fail and triggers a rebuild
            if (@filemtime(wikiFN($page)) < @filemtime($firstChunk)) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->reusePageChunks($itemList, $page, $chunkID);
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->deletePageChunks($chunkID);
                $this->createPageChunks($itemList, $page, $chunkID);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Split the given page, fetch embedding vectors, save chunks and add them to the tree list
     *
     * Chunks for which no embedding could be fetched are logged and skipped. At most
     * MAX_CHUNKS_PER_PAGE chunks are stored, because any further ID would belong to the next page.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page to split
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     * @throws \Exception
     */
    protected function createPageChunks(ItemList $itemList, $page, $chunkID)
    {
        $text = rawWiki($page);
        $chunks = $this->splitIntoChunks($text);
        $meta = [
            'pageid' => $page,
        ];

        $stored = 0; // number of chunks successfully saved so far
        foreach ($chunks as $chunk) {
            if ($stored >= self::MAX_CHUNKS_PER_PAGE) {
                // the next ID would collide with the first chunk of the following page
                if ($this->logger) {
                    $this->logger->warning(
                        'Page {page} has more than {max} chunks, remaining chunks are skipped',
                        ['page' => $page, 'max' => self::MAX_CHUNKS_PER_PAGE]
                    );
                }
                break;
            }

            try {
                $embedding = $this->openAI->getEmbedding($chunk);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                // the ID is not consumed, so stored chunk IDs stay contiguous
                continue;
            }

            $item = new Item($chunkID + $stored, $embedding);
            $itemList->addItem($item);
            $this->saveChunk($item->getId(), $chunk, $embedding, $meta);
            $stored++;
        }

        if ($this->logger) {
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
        }
    }

    /**
     * Load the existing chunks for the given page and add them to the tree list
     *
     * Chunks are read in ID order until the first missing one.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page the chunks belong to
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function reusePageChunks(ItemList $itemList, $page, $chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->loadChunk($chunkID + $i);
            if (!$chunk) break;
            // each chunk must keep its own ID, otherwise search results cannot be mapped back to it
            $item = new Item($chunkID + $i, $chunk['embedding']);
            $itemList->addItem($item);
        }
        if ($this->logger) {
            $this->logger->success('{id} reused {count} chunks', ['id' => $page, 'count' => $i]);
        }
    }

    /**
     * Delete all possibly existing chunks for one page (identified by the first chunk ID)
     *
     * Deletion stops at the first missing chunk file.
     *
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function deletePageChunks($chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->getChunkFilePath($chunkID + $i);
            if (!file_exists($chunk)) break;
            unlink($chunk);
        }
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed, because some may be filtered out below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // the index may reference chunks whose file is missing or unreadable
            if (!$chunk) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of less than MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences, which are then concatenated until the token limit would be
     * reached. Sentences that exceed the limit on their own are skipped with a warning. Empty chunks
     * are never returned, so an empty text results in an empty array.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of a truthiness loop: a sentence like "0" must not end the iteration
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty when the first sentence did not fit
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to the file returned by getChunkFilePath().
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param float[] $embedding embedding vector of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $embedding, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'embedding' => $embedding,
            'meta' => $meta,
        ];

        $chunkfile = $this->getChunkFilePath($id);
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|false The chunk data [id, text, embedding, meta => []], false if not found or unreadable
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getChunkFilePath($id);
        if (!file_exists($chunkfile)) return false;

        $data = json_decode(io_readFile($chunkfile, false), true);
        // json_decode returns null for invalid JSON, report that as "not found"
        return is_array($data) ? $data : false;
    }

    /**
     * Return the path to the chunk file
     *
     * @param int $id The chunk id
     * @return string
     */
    protected function getChunkFilePath($id)
    {
        $id = dechex($id); // use hexadecimal for shorter file names
        return $this->getStorageDir('chunk') . $id . '.json';
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path ends with a slash.
     *
     * @param string $subdir Optional sub directory below the storage directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}