<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';

    /**
     * Size of the chunk ID range reserved per page: chunk IDs of a page start at (page ID * this value),
     * so a page must never store more chunks than this or it would overwrite the next page's chunks.
     */
    const MAX_CHUNKS_PER_PAGE = 100;

    /** @var OpenAI */
    protected $openAI;
    /** @var CLI|null */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * Without a logger all progress and error messages are silently dropped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index. Chunks of pages that have not been modified since their chunks were
     * written are reused, all other pages are split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // NOTE(review): 1536 is presumably the dimension count of the vectors returned by
        // OpenAI::getEmbedding() - confirm when changing the embedding model
        $itemList = new ItemList(1536);
        foreach ($pages as $pid => $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;

            // each page owns a fixed range of chunk IDs starting here
            $chunkID = $pid * self::MAX_CHUNKS_PER_PAGE;

            $firstChunk = $this->getChunkFilePath($chunkID);
            // a missing chunk file makes filemtime() return false, which sends us to the else branch
            if (@filemtime(wikiFN($page)) < @filemtime($firstChunk)) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->reusePageChunks($itemList, $page, $chunkID);
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->deletePageChunks($chunkID);
                $this->createPageChunks($itemList, $page, $chunkID);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Split the given page, fetch embedding vectors, save chunks and add them to the tree list
     *
     * Chunks for which no embedding could be fetched are logged and skipped. At most
     * MAX_CHUNKS_PER_PAGE chunks are stored, so the IDs never reach into the next page's range.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page to split
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     * @throws \Exception
     */
    protected function createPageChunks(ItemList $itemList, $page, $chunkID)
    {
        $text = rawWiki($page);
        $chunks = $this->splitIntoChunks($text);
        $meta = [
            'pageid' => $page,
        ];
        $lastID = $chunkID + self::MAX_CHUNKS_PER_PAGE - 1;
        foreach ($chunks as $chunk) {
            if ($chunkID > $lastID) {
                // the next ID would belong to the following page, stop here
                if ($this->logger) {
                    $this->logger->warning(
                        'Page {page} has more than {max} chunks, remaining chunks are skipped',
                        ['page' => $page, 'max' => self::MAX_CHUNKS_PER_PAGE]
                    );
                }
                break;
            }

            try {
                $embedding = $this->openAI->getEmbedding($chunk);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                // the ID is not advanced, so stored chunk IDs stay contiguous
                continue;
            }
            $item = new Item($chunkID, $embedding);
            $itemList->addItem($item);
            $this->saveChunk($item->getId(), $chunk, $embedding, $meta);
            $chunkID++;
        }
        if ($this->logger) {
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
        }
    }

    /**
     * Load the existing chunks for the given page and add them to the tree list
     *
     * Loading stops at the first chunk ID for which no chunk data can be loaded.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page the chunks belong to
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function reusePageChunks(ItemList $itemList, $page, $chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->loadChunk($chunkID + $i);
            if (!$chunk) break;
            // every chunk has to be registered under its own ID so search results map back to its file
            $item = new Item($chunkID + $i, $chunk['embedding']);
            $itemList->addItem($item);
        }
        if ($this->logger) {
            // $i equals the number of chunks that were loaded
            $this->logger->success('{id} reused {count} chunks', ['id' => $page, 'count' => $i]);
        }
    }

    /**
     * Delete all possibly existing chunks for one page (identified by the first chunk ID)
     *
     * Deletion stops at the first chunk file that does not exist.
     *
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function deletePageChunks($chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->getChunkFilePath($chunkID + $i);
            if (!file_exists($chunk)) break;
            unlink($chunk);
        }
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = $this->getTree();
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit * 2); // we get twice as many as needed

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // the index may reference chunk data that is no longer available on disk
            if (!$chunk) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Access to the KD Tree
     *
     * @return FSKDTree
     */
    public function getTree()
    {
        $file = $this->getStorageDir() . self::INDEX_FILE;
        return new FSKDTree($file, new ItemFactory());
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to a chunk while it stays below MAX_TOKEN_LEN tokens. Sentences that are
     * longer than MAX_TOKEN_LEN on their own are dropped with a warning. Empty chunks are never returned,
     * so an empty text results in an empty array.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a sentence like "0" is falsy and must not end the loop early
        foreach ($sentences as $sentence) {
            if ((string)$sentence === '') continue;

            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty if nothing was added yet
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to the chunk file belonging to the given ID.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param float[] $embedding embedding vector of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $embedding, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'embedding' => $embedding,
            'meta' => $meta,
        ];

        $chunkfile = $this->getChunkFilePath($id);
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|false The chunk data [id, text, embedding, meta => []], false if not found or unreadable
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getChunkFilePath($id);
        if (!file_exists($chunkfile)) return false;

        $data = json_decode(io_readFile($chunkfile, false), true);
        // json_decode() returns null for broken files, treat those like missing chunks
        return is_array($data) ? $data : false;
    }

    /**
     * Return the path to the chunk file
     *
     * @param int $id The chunk ID
     * @return string
     */
    protected function getChunkFilePath($id)
    {
        $id = dechex($id); // use hexadecimal for shorter file names
        return $this->getStorageDir('chunk') . $id . '.json';
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path ends with a slash.
     *
     * @param string $subdir optional sub directory below the storage directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}