<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\plugin\aichat\backend\AbstractStorage;
use dokuwiki\plugin\aichat\backend\Chunk;
use dokuwiki\plugin\aichat\backend\KDTreeStorage;
use dokuwiki\plugin\aichat\backend\SQLiteStorage;
use dokuwiki\Search\Indexer;
use Hexogen\KDTree\Exception\ValidationException;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
 * OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** Maximum number of tokens per chunk (also the hard cap for a single sentence) */
    public const MAX_TOKEN_LEN = 1000;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger for progress, warnings and errors */
    protected $logger;
    /** @var AbstractStorage backend holding the chunk vectors */
    protected $storage;

    /**
     * @param OpenAI $openAI
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        //$this->storage = new KDTreeStorage(); // FIXME make configurable
        $this->storage = new SQLiteStorage(); // FIXME make configurable
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new index from all pages
     *
     * Deletes the existing index. Chunks of pages that are older than their stored chunks
     * are reused; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the embedding vector dimension (matches OpenAI's ada-002 model — confirm if model changes)
        $this->storage->startCreation(1536);
        foreach ($pages as $pid => $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue; // FIXME delete previous chunks

            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            $firstChunk = $this->storage->getChunk($chunkID);
            // @filemtime: suppress warning when the page file vanished between getPages() and here
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Chunks whose embedding request fails are logged and skipped, so the result may
     * contain fewer chunks than the page was split into.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];
        $parts = $this->splitIntoChunks(rawWiki($page));
        foreach ($parts as $part) {
            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            // report the number of chunks actually embedded (failed chunks are skipped above)
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        $chunks = $this->storage->getSimilarChunks($vector, $limit);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Sentences are never split across chunks; over-long sentences are dropped with a warning.
     * Empty chunks are never returned.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a plain truthiness test would stop early on a falsy sentence like "0"
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; guard against pushing an empty initial chunk
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the last chunk, but never emit an empty one (would waste an embedding call)
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }
}