<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\AbstractModel;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel model used to create the embedding vectors */
    protected $model;
    /** @var CLI|null optional CLI logger for progress and error output */
    protected $logger;
    /** @var Encoder lazily instantiated token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;
    /** @var AbstractStorage backend in which chunks and their vectors are persisted */
    protected $storage;

    /** @var string[] sliding window of recently seen sentences, reused as overlap between chunks */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model The model to use for creating embeddings
     * @param AbstractStorage $storage The storage backend for the index
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and cached for subsequent calls.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that should not be indexed (missing, hidden, tiny,
     * filtered by the regular expressions) are removed from storage. Pages whose chunks are still
     * newer than the page file are reused as-is; everything else is re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // NOTE(review): $skipRE is matched against "$page" but $matchRE against ":$page" —
            // confirm this asymmetry is intentional (absolute-ID anchoring for matchRE only)
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which the embedding request fails are logged and skipped; the
     * remaining chunks are still returned.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page; // the text renderer needs the current page ID to resolve links etc.
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        $fetch = ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        // fill the context up to the model's token limit with readable chunks
        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks that fit the model's embedding token limit
     *
     * Sentences are accumulated until the limit would be exceeded; each new chunk starts
     * with up to MAX_OVERLAP_LEN tokens of the previous chunk's trailing sentences.
     *
     * @param string $text
     * @return array
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);
        // FIX: reset the overlap window, otherwise sentences from a previously chunked
        // text (e.g. the page processed before this one) leak into this text's chunks
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // FIX: test against null explicitly — a bare truthiness check would abort the
        // loop on falsy sentences such as "0" and silently drop the rest of the text
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * Oldest sentences are dropped until the queue fits within MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}