<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_TOKEN_LEN tokens each. For each chunk the embedding vector is
 * fetched from OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 */
class Embeddings
{

    /** Upper bound for the number of tokens in a single chunk */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory below the wiki's index directory that holds all data */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D tree inside the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /** @var object|null optional logger, only its success() method is used */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to fetch embedding vectors
     * @param object|null $logger optional logger providing a success($message, $context) method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the whole index from scratch
     *
     * Deletes the existing storage directory, walks all pages known to the search indexer, splits each
     * existing page into chunks, fetches an embedding per chunk and finally persists the K-D tree.
     *
     * @return void
     * @throws \Exception when a page contains a sentence that exceeds MAX_TOKEN_LEN
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // NOTE(review): 1536 is presumably the dimension of the embedding vectors returned by
        // OpenAI::getEmbedding() - confirm against the model in use
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is the item id and also names the chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success(
                    'Split {id} into {count} chunks',
                    ['id' => $page, 'count' => count($chunks)]
                );
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the chunks that are closest to the given query
     *
     * The query is embedded via OpenAI and the persisted K-D tree is searched for its nearest neighbours.
     *
     * @param string $query The text to find similar chunks for
     * @param int $limit Maximum number of chunks to return
     * @return array List of chunk data as returned by loadChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks that stay within the token limit
     *
     * The text is split into sentences; sentences are concatenated until adding the next one would
     * reach MAX_TOKEN_LEN, then a new chunk is started. Empty or whitespace-only chunks are never
     * returned, so an empty text yields an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a loop conditioned on the sentence value itself would stop early
        // on a falsy sentence such as "0" and drop the remainder of the text
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, but never emit an empty one (happens when the very first
                // sentence already fills the whole token budget)
                if (trim($chunk) !== '') {
                    $chunks[] = $chunk;
                }
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the remainder, skipping it for empty input
        if (trim($chunk) !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array The chunk data [id, text, meta => []]
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path always ends in a slash.
     *
     * @param string $subdir
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}