<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 */
class Embeddings
{
    /** Maximum number of tokens per chunk */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the index directory below the wiki's index dir */
    const INDEX_NAME = 'aichat';
    /** File name of the serialized K-D tree */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /**
     * @var object|null Optional logger; must provide a success($msg, $context) method
     *                  (presumably a CLI logger — confirm against callers)
     */
    protected $logger;

    /**
     * @param OpenAI $openAI the OpenAI client to fetch embeddings with
     * @param object|null $logger optional progress logger (see $logger property)
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Build the embeddings index from scratch
     *
     * Deletes any existing index, then walks all indexed wiki pages, splits each into
     * token-limited chunks, fetches an embedding per chunk from OpenAI, stores the chunk
     * payloads on disk and persists the resulting K-D tree.
     *
     * @return void
     * @throws \Exception when chunk splitting or the OpenAI request fails
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // 1536 = dimensionality of the embedding vectors
        // (assumes OpenAI's text embedding model — TODO confirm if model changes)
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue; // index may reference deleted pages
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the sequential id links the tree item to its chunk file on disk
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // fetch twice as many as needed, so ACL filtering below can still fill the limit
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // skip chunks whose file is missing or unreadable (stale index)
            if (!$chunk) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Sentences are never cut apart; a sentence that would overflow the current
     * chunk starts a new one.
     *
     * @param string $text
     * @return string[] the chunks (may be empty for empty input)
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence like "0" must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // don't emit an empty trailing chunk (e.g. for empty input)
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null when missing or corrupt
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        // json_decode returns null for unreadable or invalid data; callers must check
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path always
     * ends with a slash.
     *
     * @param string $subdir
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}