<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks whose token length is limited by getChunkSize(). For each chunk the
 * embedding vector is fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var int|float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured chunk size in tokens */
    protected $configChunkSize;

    /** @var int the configured maximum number of chunks to use as context */
    protected $configContextChunks;

    /** @var float minimum score a chunk needs to be used (configured percentage divided by 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, the keys chunkSize, contextChunks and
     *                      similarityThreshold are read
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and then reused.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's maximum input length, 90% of the
     * embedding model's maximum input length and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns a float, cast the result to always return the documented integer
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the indexer. Pages that do not exist, are hidden, are smaller
     * than 150 bytes or are excluded by the given expressions have their chunks deleted. For all other
     * pages the existing chunks are reused when the page file is older than the first stored chunk,
     * otherwise new chunks are created.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): a page yielding more than 100 chunks would run into the ID range of the next page
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page")) // note: matched against the ID with a leading colon
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Parts for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // rendering is done with the global ID pointing to the page; restore it afterwards so
            // the temporary change does not leak to the caller
            $keepID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $keepID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks scoring below the similarity threshold are dropped. The number of returned chunks is
     * limited by the configured number of context chunks and by the chat model's maximum input
     * token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxInput = $this->chatModel->getMaxInputTokenLength();

        // the number of chunks to fetch has to be a whole number
        $fetch = (int)min(
            floor($maxInput / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $maxInput) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks
     *
     * The text is split into sentences which are collected into chunks staying below the chunk size.
     * Each new chunk starts with the most recent sentences of the preceding text (up to
     * MAX_OVERLAP_LEN tokens) to create an overlap between chunks.
     *
     * @param string $text
     * @return string[] the chunks; the last entry may be an empty string when there was no text left
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting

        // each text starts with a fresh overlap queue, sentences of a previously
        // split text must not leak into the chunks of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // a foreach is used on purpose: testing the sentence for truthiness would end the
        // processing early on a sentence like "0"
        foreach ($sentences as $sentence) {
            if ((string)$sentence === '') continue; // nothing to add

            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember sentence for overlap check; this has to happen for every sentence that
            // made it into a chunk, otherwise the next overlap would have a gap
            $this->rememberSentence($sentence);
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content has at most
     * MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}