<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\AbstractChatModel;
use dokuwiki\plugin\aichat\Model\AbstractEmbeddingModel;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most getChunkSize() tokens each. For each chunk the embedding
 * vector is fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractChatModel */
    protected $chatModel;

    /** @var AbstractEmbeddingModel */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder lazily created, see getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /**
     * @param AbstractChatModel $chatModel Model whose token limits bound chunk size and context size
     * @param AbstractEmbeddingModel $embedModel Model used to calculate the embedding vectors
     * @param AbstractStorage $storage Backend the chunks are stored in
     */
    public function __construct(
        AbstractChatModel $chatModel,
        AbstractEmbeddingModel $embedModel,
        AbstractStorage $storage
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * Without a logger all progress and error reporting in this class is skipped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smaller of the embedding token limits reported by the chat and the embedding model.
     *
     * @return int
     */
    public function getChunkSize()
    {
        return min(
            $this->chatModel->getMaxEmbeddingTokenLength(),
            $this->embedModel->getMaxEmbeddingTokenLength()
        );
    }

    /**
     * Update the embeddings storage
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) {
                    $this->logger->info("Reusing chunks for $page");
                }
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // rendering happens in the context of the page, but the caller's $ID must not be clobbered
            $previousID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $previousID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++; // only successfully embedded chunks consume an ID
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the chat model's maximum context token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxContext = $this->chatModel->getMaxContextTokenLength();

        // number of full sized chunks that fit into the context; guard against a zero chunk size
        $fetch = (int) ceil(
            ($maxContext / max(1, $this->getChunkSize()))
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $encoder = $this->getTokenEncoder();
        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkTokens = count($encoder->encode($chunk->getText()));
            if ($size + $chunkTokens > $maxContext) break; // we have enough

            $result[] = $chunk;
            $size += $chunkTokens;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most getChunkSize() tokens
     *
     * The text is split at sentence boundaries. Each chunk after the first starts with the most
     * recently seen sentences (up to MAX_OVERLAP_LEN tokens) to create an overlap between chunks.
     * Sentences longer than the chunk size are currently skipped with a warning.
     *
     * @param string $text
     * @return array list of chunk texts, may contain empty strings
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize();

        // the overlap queue is instance state, it must not carry over from a previously split text
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';

        // foreach instead of while/array_shift: a falsy sentence like "0" must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue) . $sentence;
                $chunklen = count($tiktok->encode($chunk));
                if ($chunklen > $maxLen) {
                    // overlap and sentence do not fit together, prefer the new content
                    $chunk = $sentence;
                    $chunklen = $slen;
                }
            }

            // remember every used sentence for the next overlap, including the one opening a new chunk
            $this->rememberSentence($sentence);
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined text is at most MAX_OVERLAP_LEN tokens.
     * A single sentence longer than that leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}