<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most MAX_TOKEN_LEN tokens each. For each chunk the embedding
 * vector is fetched via the OpenAI helper and stored in a K-D Tree, chunk data is written to the
 * file system.
 */
class Embeddings
{

    /** Maximum number of tokens a single chunk may contain */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory below the DokuWiki index dir that holds all data of this index */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D tree inside the storage directory */
    const INDEX_FILE = 'index.bin';
    /**
     * Number of dimensions of the vectors stored in the K-D tree
     *
     * NOTE(review): presumably this has to match the length of the vectors returned by
     * OpenAI::getEmbedding() - confirm when switching the embedding model.
     */
    const VECTOR_DIMENSIONS = 1536;

    /** @var OpenAI */
    protected $openAI;
    /** @var CLI|null */
    protected $logger;

    /**
     * @param OpenAI $openAI The client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * When set, progress information is reported while the index is created.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index first. Because the old data is removed up front, an exception
     * thrown while the pages are processed leaves no usable index behind.
     *
     * @return void
     * @throws \Hexogen\KDTree\Exception\ValidationException
     * @throws \Exception when a page contains a sentence that exceeds MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        $itemList = new ItemList(self::VECTOR_DIMENSIONS);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is the item id in the tree and the name of the chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks whose data file is missing or unreadable are skipped.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array List of chunk data arrays as returned by loadChunk()
     * @throws \Exception when no index has been created yet
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;

        // fail early with a clear message, before an embedding request is made
        $file = $this->getStorageDir() . self::INDEX_FILE;
        if (!file_exists($file)) {
            throw new \Exception('No embeddings index found, it needs to be created first');
        }

        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit * 2); // we get twice as many as needed

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // chunk file missing or not valid JSON
            if (!is_array($chunk)) continue;

            // filter out chunks the user is not allowed to read
            if ($auth) {
                // without a page id no ACL check is possible, so the chunk is not exposed
                if (!isset($chunk['meta']['pageid'])) continue;
                if (auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            }

            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to the current chunk as long as the summed token count stays below
     * MAX_TOKEN_LEN, otherwise a new chunk is started. Empty chunks are never returned, so an
     * empty text results in an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence has more than MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a truthiness based loop would stop at a sentence like "0"
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty if the very first
                // sentence already fills a whole chunk
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to a file named after the chunk id in the 'chunk' sub directory
     * of the storage directory.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id The chunk id in the K-D tree
     * @return array|null The chunk data [id, text, meta => []], null if the stored data could not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet.
     *
     * @param string $subdir Optional sub directory below the storage directory
     * @return string The directory path, always ending in a slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}