<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embedding index used for similarity search over wiki pages.
 *
 * Pages are split into sentence-aligned chunks of at most MAX_TOKEN_LEN tokens,
 * each chunk is embedded via the OpenAI API and stored as a JSON file, and the
 * embedding vectors are persisted in a file-backed KD-tree for nearest-neighbour
 * lookup.
 */
class Embeddings
{
    /** @var int maximum number of tokens allowed in a single chunk */
    const MAX_TOKEN_LEN = 1500;

    /** @var string name of the index directory below DokuWiki's indexdir */
    const INDEX_NAME = 'aichat';

    /** @var string file name of the persisted KD-tree */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to create the embedding vectors */
    protected $openAI;

    /**
     * Optional logger; anything exposing a success($msg, $context) method.
     * NOTE(review): only ->success() is called here — confirm the expected interface.
     *
     * @var object|null
     */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to request embeddings
     * @param object|null $logger optional progress logger (see $logger property)
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * (Re)build the whole embedding index from scratch.
     *
     * Deletes any existing index, walks all indexed wiki pages, splits each
     * page into chunks, embeds every chunk and finally persists a KD-tree
     * over all embedding vectors.
     *
     * @return void
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // 1536 is the dimensionality of OpenAI's embedding vectors
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue; // skip stale index entries
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                // chunk text and meta data are stored separately by item ID,
                // the KD-tree only holds the vectors
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Return the chunks most similar to the given query.
     *
     * Embeds the query and runs a nearest-neighbour search against the
     * persisted KD-tree, then loads the stored chunk data for each hit.
     *
     * @param string $query the question/text to find similar chunks for
     * @param int $limit maximum number of chunks to return
     * @return array[] list of chunk records as stored by saveChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens,
     * never breaking a sentence apart.
     *
     * @param string $text
     * @return string[] the resulting chunks (empty array for empty input)
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a bare truthiness check would abort
        // the loop early on falsy sentences such as the string "0"
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // only keep the trailing chunk if it holds any text, otherwise empty
        // input would produce a spurious '' chunk that gets embedded
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Persist a single chunk as a JSON file named after its item ID.
     *
     * @param int $id the KD-tree item ID of this chunk
     * @param string $text the chunk's text
     * @param array $meta additional meta data (e.g. the source page ID)
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load a previously saved chunk by its item ID.
     *
     * @param int $id the KD-tree item ID of the chunk
     * @return array|null the chunk record, or null when the file is missing/invalid
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return (and create if needed) the storage directory for the index.
     *
     * @param string $subdir optional sub directory below the index dir
     * @return string the directory path, always with a trailing slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}