<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_TOKEN_LEN tokens each. For each chunk the embedding
 * vector is fetched from OpenAI and stored in a K-D tree; the chunk text and its meta data are
 * written to the file system as one JSON file per chunk, keyed by the tree item id.
 */
class Embeddings
{
    /** @var int maximum number of tokens per chunk */
    const MAX_TOKEN_LEN = 1000;
    /** @var string name of the sub directory below the wiki's index dir */
    const INDEX_NAME = 'aichat';
    /** @var string file name of the serialized K-D tree */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;

    /**
     * @param OpenAI $openAI the client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * When no logger is set, progress and per-chunk errors are silently ignored.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index first. Pages that do not exist, are hidden, or match
     * $skipRE are skipped. A failed embedding request skips only the affected chunk
     * (logged as an error) so one bad chunk cannot abort the whole index run.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // 1536 is the dimensionality of the OpenAI embedding vectors
        // (assumes text-embedding-ada-002 — confirm against the OpenAI client config)
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;

            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
            ];
            foreach ($chunks as $chunk) {
                try {
                    $embedding = $this->openAI->getEmbedding($chunk);
                } catch (\Exception $e) {
                    // best effort: log and continue with the next chunk
                    if ($this->logger) {
                        $this->logger->error(
                            'Failed to get embedding for chunk of page {page}: {msg}',
                            ['page' => $page, 'msg' => $e->getMessage()]
                        );
                    }
                    continue;
                }
                // the item id doubles as the chunk file name, see saveChunk()/loadChunk()
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success(
                    'Split {id} into {count} chunks',
                    ['id' => $page, 'count' => count($chunks)]
                );
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Twice as many neighbors as requested are fetched from the tree so that ACL-filtered
     * chunks can be compensated for.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed to compensate for ACL filtering below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // skip chunks whose data file is missing or unreadable
            if (!$chunk) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * The text is split on sentence boundaries; sentences are accumulated until the
     * token budget is reached, then a new chunk is started. Sentences longer than the
     * budget are dropped with a warning. Empty chunks are never emitted (an empty text
     * yields an empty array).
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach, not while(array_shift(...)): a falsy sentence like "0" must not
        // terminate the loop early and silently drop the rest of the page
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; never emit an empty chunk (would waste an embedding call)
                if ($chunk !== '') {
                    $chunks[] = $chunk;
                }
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The chunk is written as JSON to "<storagedir>/chunk/<id>.json".
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null when the file
     *                    is missing or does not contain valid JSON
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * Creates the directory if it does not exist yet. The returned path always ends
     * with a slash.
     *
     * @param string $subdir optional sub directory below the index dir
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}