<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Builds and queries a nearest-neighbour index of embeddings for wiki page chunks
 *
 * Pages are split into chunks of whole sentences, each chunk is sent to the OpenAI
 * helper to obtain an embedding, the embeddings are stored in a KD tree on disk and
 * the chunk texts are stored next to it as JSON files.
 */
class Embeddings
{
    /** Upper bound for the number of tokens in a single chunk */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory (below the wiki's index directory) holding all data */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted KD tree within the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to request embeddings */
    protected $openAI;

    /**
     * @var object|null optional logger; only its success($message, $context) method is used
     */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to request embeddings
     * @param object|null $logger optional object offering a success($message, $context) method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the complete index from scratch
     *
     * Deletes the existing storage directory, then walks all pages known to the
     * search indexer. Each existing page is split into chunks, an embedding is
     * requested for every chunk and the chunk is saved together with its page id.
     * Finally the KD tree is built from all embeddings and written to disk.
     *
     * @return void
     * @throws \Exception when a page contains a sentence exceeding MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // NOTE(review): 1536 is presumably the dimensionality of the embedding vectors
        // returned by the OpenAI helper - confirm against the model in use
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) {
                continue;
            }
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter doubles as the chunk's id and file name
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success(
                    'Split {id} into {count} chunks',
                    ['id' => $page, 'count' => count($chunks)]
                );
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks whose embeddings are nearest to the query's embedding
     *
     * @param string $query the text to find similar chunks for
     * @param int $limit maximum number of chunks to return
     * @return array list of chunk data as returned by loadChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to the current chunk as long as the summed token count
     * stays below MAX_TOKEN_LEN; otherwise a new chunk is started. Chunks that are
     * empty or consist of whitespace only are never returned, so an empty text
     * results in an empty list.
     *
     * @param string $text the raw text to split
     * @return string[] the chunks in original order
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a falsy sentence such as "0" must not end the loop early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; the current one may still be empty when the very
                // first sentence already has exactly MAX_TOKEN_LEN tokens
                if (trim($chunk) !== '') {
                    $chunks[] = $chunk;
                }
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the remainder, but never emit an empty chunk
        if (trim($chunk) !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Store a chunk and its metadata as a JSON file in the chunk storage directory
     *
     * @param int|string $id identifier of the chunk, used as the file name
     * @param string $text the chunk's text
     * @param array $meta additional data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load a previously saved chunk
     *
     * @param int|string $id identifier of the chunk as passed to saveChunk()
     * @return array|null the decoded data ('id', 'text', 'meta'); json_decode() yields
     *                    null when the file content is not valid JSON
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Get the directory all index data is stored in, creating it when needed
     *
     * @param string $subdir optional sub directory below the storage directory
     * @return string the directory path including a trailing slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) {
            $dir .= $subdir . '/';
        }
        io_mkdir_p($dir);
        return $dir;
    }
}