<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Search\Indexer;
use Hexogen\KDTree\FSKDTree;
use Hexogen\KDTree\FSTreePersister;
use Hexogen\KDTree\Item;
use Hexogen\KDTree\ItemFactory;
use Hexogen\KDTree\ItemList;
use Hexogen\KDTree\KDTree;
use Hexogen\KDTree\NearestSearch;
use Hexogen\KDTree\Point;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embedding index for the AI chat plugin.
 *
 * Wiki pages are split into token-limited chunks, each chunk is embedded via
 * the OpenAI API and indexed in a KD-Tree for nearest-neighbour similarity
 * search. Chunk texts and metadata are stored as JSON files next to the tree.
 */
class Embeddings
{
    /** @var int maximum number of tokens allowed in a single chunk */
    const MAX_TOKEN_LEN = 1000;
    /** @var string subdirectory below the wiki's index dir where all data lives */
    const INDEX_NAME = 'aichat';
    /** @var string file name of the serialized KD-Tree */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to create embeddings */
    protected $openAI;
    /**
     * Optional progress logger; only success() is called on it
     * (presumably a CLI logger — TODO confirm expected interface).
     * Declared explicitly: dynamic properties are deprecated in PHP 8.2.
     */
    protected $logger;

    /**
     * @param OpenAI $openAI client used for all embedding requests
     * @param object|null $logger optional logger with a success() method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the whole embedding index from scratch.
     *
     * Deletes any existing index data, then embeds every existing wiki page
     * chunk by chunk, writes each chunk's text/meta JSON and finally persists
     * the KD-Tree to disk.
     *
     * @return void
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // 1536 is the dimensionality of OpenAI's embedding vectors
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            // the indexer may still list pages that have since been deleted
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // chunk id doubles as the KD-Tree item id and the JSON file name
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Return the chunks most similar to the given query.
     *
     * Embeds the query and runs a nearest-neighbour search against the
     * file-based KD-Tree.
     *
     * @param string $query the question/text to find similar chunks for
     * @param int $limit maximum number of chunks to return
     * @return array[] list of chunk records as written by saveChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens,
     * breaking only on sentence boundaries.
     *
     * @param string $text
     * @return string[] the chunks (no empty chunks are returned)
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift(...)): the while form stopped
        // at the first falsy sentence (e.g. "0"), dropping the remaining text
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the last chunk, but never emit an empty one (e.g. empty $text)
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Persist a chunk's text and metadata as a JSON file named after its id.
     *
     * @param int $id the chunk/KD-Tree item id
     * @param string $text the chunk text
     * @param array $meta arbitrary metadata (e.g. source page id)
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load a chunk record previously written by saveChunk().
     *
     * @param int $id the chunk id
     * @return array|null the decoded record, or null when the file is missing/corrupt
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return (and create, if needed) the plugin's storage directory.
     *
     * @param string $subdir optional sub directory below the index dir
     * @return string absolute path with trailing slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}