<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\Extension\PluginInterface;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks (see getChunkSize() for the size limit). For each chunk the
 * embedding vector is fetched from the configured embedding model and stored in the
 * Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var float the time (in seconds) spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks to fetch as context */
    protected $configContextChunks;

    /** @var float minimum score a chunk needs to be considered similar (config value / 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is divided by 100 before it is compared against chunk scores
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created lazily on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's input limit, 90% of the
     * embedding model's input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns a float, cast so callers really get the documented int
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates all pages known to the indexer. Pages that are missing, hidden, very small or
     * excluded by the given expressions have their chunks removed. Pages older than their
     * stored chunks keep them, all other pages are re-chunked.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): this reserves 100 IDs per page before the next page's range begins
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page is rendered in the 'aichat' output mode. If rendering fails, the raw wiki
     * text is used instead. Chunks for which no embedding could be fetched are skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        // NOTE(review): the global $ID is overwritten here and not restored afterwards
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // default action allowed: plugin supplied body is prepended to the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default action prevented: only the plugin supplied body is used
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks scoring below the similarity threshold are dropped. Chunks are collected until
     * adding another one would exceed the chat model's maximum input token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // never ask for more chunks than could fit the chat model or than are configured
        $fetch = min(
            ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() yields false for pages in the root namespace; exploding that would create
        // a bogus empty namespace crumb, so only split when there really is a namespace
        $ns = getNS($id);
        $namespaces = ($ns === false || $ns === '') ? [] : explode(':', $ns);
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of at most getChunkSize() tokens
     *
     * The text is split into sentences which are then combined into chunks. A new chunk
     * starts with the most recently remembered sentences of the previous one (up to
     * MAX_OVERLAP_LEN tokens) to create an overlap between chunks.
     *
     * @param string $text
     * @return string[] the non-empty, trimmed chunks
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting

        // start with a clean overlap queue, sentences of a previously split text must not leak in
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // iterate explicitly: a falsy sentence like "0" must not end the loop prematurely
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $sentence = (string)$sentence;
            if ($sentence === '') continue;

            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }

        // add the remaining chunk, treated the same way as the chunks above
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}