<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\backend\AbstractStorage;
use dokuwiki\plugin\aichat\backend\Chunk;
use dokuwiki\plugin\aichat\backend\KDTreeStorage;
use dokuwiki\plugin\aichat\backend\SQLiteStorage;
use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_TOKEN_LEN tokens each. For each chunk the embedding
 * vector is fetched from OpenAI and stored in the Storage backend.
 */
class Embeddings
{

    /** Upper bound for the number of tokens in a single chunk */
    const MAX_TOKEN_LEN = 1000;


    /** @var OpenAI */
    protected $openAI;
    /** @var CLI|null Optional logger, only set via setLogger() */
    protected $logger;

    /** @var AbstractStorage */
    protected $storage;

    /**
     * @param OpenAI $openAI Client used to fetch the embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search indexer. Pages that no longer qualify are removed from
     * the storage, pages that are older than their stored chunks are reused and everything else
     * is split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // NOTE(review): 1536 is presumably the dimension of the embedding vectors - confirm
        // against AbstractStorage::startCreation()
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): a page that yields more than 100 chunks would run into the ID range
            // of the next page ID - verify this cannot happen in practice
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and left out of the result,
     * they do not abort the processing of the remaining chunks.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            // the page is rendered with itself as the current $ID; put the previous value back
            // afterwards so the change does not leak into whatever runs after this method
            $previousID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $previousID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            // IDs are handed out consecutively, failed or empty parts do not consume an ID
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The ACL filter is applied after the storage lookup, so fewer than $limit chunks may be
     * returned. No ACL check is done when no auth backend is loaded.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        $chunks = $this->storage->getSimilarChunks($vector, $limit);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }


    /**
     * Split the given text into chunks
     *
     * The text is split into sentences. Sentences are appended to the current chunk as long as
     * the summed up token counts stay below MAX_TOKEN_LEN, otherwise a new chunk is started.
     * Sentences that are longer than MAX_TOKEN_LEN on their own are dropped with a warning.
     *
     * The result always has at least one element and may contain empty strings (e.g. for empty
     * input), callers are expected to skip those.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a loop conditioned on the sentence value itself would stop at the
        // first falsy sentence (like "0") and silently drop the rest of the text
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // the chunk still being built is always added, even when it is empty
        $chunks[] = $chunk;

        return $chunks;
    }
}