<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\backend\AbstractStorage;
use dokuwiki\plugin\aichat\backend\Chunk;
use dokuwiki\plugin\aichat\backend\KDTreeStorage;
use dokuwiki\plugin\aichat\backend\SQLiteStorage;
use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_TOKEN_LEN tokens each. For each chunk the embedding
 * vector is fetched from OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** Maximum number of tokens per chunk (measured with the TikToken encoder) */
    const MAX_TOKEN_LEN = 1000;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null optional CLI logger; all logging is skipped when unset */
    protected $logger;

    /** @var AbstractStorage backend holding the chunk vectors */
    protected $storage;

    /**
     * @param OpenAI $openAI
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        //$this->storage = new KDTreeStorage(); // FIXME make configurable
        $this->storage = new SQLiteStorage(); // FIXME make configurable
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new index from all wiki pages
     *
     * Existing, still-current chunks are reused; chunks of changed pages are recreated.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the dimensionality of the embedding vectors used here
        $this->storage->startCreation(1536);
        foreach ($pages as $pid => $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            // FIXME skipped pages may leave stale chunks behind; delete previous chunks here
            if ($skipRE && preg_match($skipRE, $page)) continue;

            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            $firstChunk = $this->storage->getChunk($chunkID);
            // @filemtime: file may vanish between getPages() and here; treat failure as "changed"
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Chunks whose embedding request fails are skipped (logged when a logger is set).
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];
        $parts = $this->splitIntoChunks(rawWiki($page));
        foreach ($parts as $part) {
            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: a single failed chunk must not abort the whole page
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++; // consecutive IDs within the page's reserved range
        }
        if ($this->logger) {
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($parts)]);
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        $chunks = $this->storage->getSimilarChunks($vector, $limit);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            // NOTE(review): ACL filtering happens after the limit is applied, so fewer than
            // $limit readable chunks may be returned — consider over-fetching if that matters
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Splitting happens on sentence boundaries; sentences longer than MAX_TOKEN_LEN
     * are currently dropped with a warning.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift(...)): a falsy sentence ('' or '0')
        // must not terminate the loop and silently drop the rest of the page
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; never record an empty chunk (e.g. when the very
                // first sentence already fills the token budget)
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the trailing chunk, but do not emit an empty chunk for empty input —
        // an empty string would otherwise be sent to the embeddings API
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }
}