<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\backend\AbstractStorage;
use dokuwiki\plugin\aichat\backend\Chunk;
use dokuwiki\plugin\aichat\backend\SQLiteStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_CHUNK_LEN tokens each. For each chunk the embedding
 * vector is fetched from OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum combined token length of all context chunks returned by getSimilarChunks() */
    const MAX_CONTEXT_LEN = 3800;

    /** @var int maximum size of a single chunk in tokens */
    const MAX_CHUNK_LEN = 1000;

    /** @var int maximum overlap carried over between consecutive chunks, in tokens */
    const MAX_OVERLAP_LEN = 200;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;

    /** @var Encoder|null lazily initialized token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend that persists chunks and their vectors */
    protected $storage;

    /** @var string[] recently added sentences, kept to build the overlap between chunks */
    private $sentenceQueue = [];

    /**
     * @param OpenAI $openAI the client used to create embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if ($this->tokenEncoder === null) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer exist, are hidden, are very small
     * or match $skipRE have their chunks removed. Unchanged pages have their existing chunks
     * reused; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the dimension of OpenAI's text embedding vectors
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which the embedding request fails are logged and skipped, they do not
     * abort processing of the remaining chunks.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            $ID = $page; // the text renderer needs the current page ID to resolve links
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // fetch a few more than needed, since not all chunks are maximum length
        $fetch = ceil((self::MAX_CONTEXT_LEN / self::MAX_CHUNK_LEN) * 1.2);
        $chunks = $this->storage->getSimilarChunks($vector, $fetch);

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > self::MAX_CONTEXT_LEN) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_CHUNK_LEN tokens
     *
     * Each chunk starts with up to MAX_OVERLAP_LEN tokens of the sentences that ended the
     * previous chunk, so context is not lost at chunk boundaries.
     *
     * @param string $text
     * @return array the list of chunk strings
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        // start with a clean overlap queue, so no sentences from a previously
        // chunked document leak into this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence like "0" must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_CHUNK_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_CHUNK_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // current chunk is full, add it to the result
                $chunks[] = $chunk;

                // start new chunk with the remembered overlap sentences
                $chunk = join(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the overflowing sentence is part of the new chunk,
                // so it has to be available for the next overlap as well
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its combined token count stays
     * within MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(join(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}