<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\backend\AbstractStorage;
use dokuwiki\plugin\aichat\backend\Chunk;
use dokuwiki\plugin\aichat\backend\KDTreeStorage;
use dokuwiki\plugin\aichat\backend\SQLiteStorage;
use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** Maximum number of tokens a single chunk may contain */
    public const MAX_TOKEN_LEN = 1000;

    /** @var OpenAI Client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null Optional CLI logger for progress/warning output */
    protected $logger;

    /** @var AbstractStorage Backend holding the chunk/vector data */
    protected $storage;

    /**
     * @param OpenAI $openAI Client used to create embeddings
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify (deleted, hidden, tiny, or
     * matching $skipRE) have their chunks removed. Unchanged pages have their existing chunks
     * reused; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the dimension of the embedding vectors stored per chunk
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which the embedding request fails are logged and skipped rather
     * than aborting the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            // the text renderer needs the global $ID set to resolve relative syntax
            // NOTE(review): $ID is not restored afterwards — verify no caller depends on it
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * Note: ACL filtering happens after fetching $limit candidates from storage, so the
     * result may contain fewer than $limit chunks even when more readable chunks exist.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        $chunks = $this->storage->getSimilarChunks($vector, $limit);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into sentence-aligned chunks of at most MAX_TOKEN_LEN tokens
     *
     * Sentences longer than MAX_TOKEN_LEN tokens are dropped with a warning (splitting
     * them is not implemented yet). Empty chunks are never emitted.
     *
     * @param string $text The text to split
     * @return string[] The resulting chunks
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach instead of while(array_shift(...)): the while form
        // stops at the first falsy sentence (e.g. the string "0"), silently
        // dropping the remainder of the text
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; guard against flushing an empty first chunk
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the trailing chunk, but never emit an empty one (e.g. for empty input)
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }
}