<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /** @var object|null optional logger; only its success() method is called by this class */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to fetch embedding vectors
     * @param object|null $logger optional logger used to report indexing progress
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the whole index from scratch
     *
     * Removes the existing storage directory, splits every existing page known to the indexer
     * into chunks, fetches an embedding for each chunk, stores the chunk data on disk and
     * finally persists the resulting K-D tree to the index file.
     *
     * @return void
     * @throws \Exception when a page contains a sentence longer than MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // NOTE(review): 1536 is presumably the dimension count of the embedding vectors - confirm
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is the item id and links the tree entry to its chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks that are closest to the given query
     *
     * The query is converted to an embedding and looked up in the persisted K-D tree.
     *
     * @param string $query The text to find similar chunks for
     * @param int $limit Maximum number of chunks to return
     * @return array List of chunk data as returned by loadChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of less than MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences first; sentences are then appended to the current chunk
     * until adding the next one would reach the token limit. Blank chunks are never returned,
     * so an empty text results in an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence is longer than MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift()): a falsy sentence such as "0" must not end the loop
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; the current one may still be empty if the very first sentence did not fit
                if (trim($chunk) !== '') {
                    $chunks[] = $chunk;
                }
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the remainder, but never emit a blank chunk (e.g. for an empty page)
        if (trim($chunk) !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null if the stored data is not valid JSON
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path ends with a slash.
     *
     * @param string $subdir
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}