<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Cache\CacheRenderer;
use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks (see getChunkSize() for the size used). For each chunk the embedding
 * vector is fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var int the time spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured chunk size ($config['chunkSize']) */
    protected $configChunkSize;

    /** @var int the configured maximum number of context chunks ($config['contextChunks']) */
    protected $configContextChunks;

    /** @var float minimum score a chunk needs to be used ($config['similarityThreshold'] / 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is configured as a percentage, internally a 0..1 fraction is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not greater than 0
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A value between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created lazily on first use.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * Without a chat model token limit the configured chunk size is used as is. Otherwise the
     * smallest of the configured size, a quarter of the chat model limit and 90% of the embedding
     * model limit is returned.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $chatLimit = $this->chatModel->getMaxInputTokenLength();
        if (!$chatLimit) {
            // no token limit, use the configured chunk size
            return (int)$this->configChunkSize;
        }

        $candidates = [
            (int)floor($chatLimit / 4), // be able to fit 4 chunks into the max input
            (int)$this->configChunkSize, // this is usually the smallest
        ];

        // NOTE(review): a falsy embedding limit is treated as "no limit" here, mirroring the chat
        // model handling above - confirm against the EmbeddingInterface contract. Without this
        // guard such a model would force a chunk size of 0.
        $embedLimit = $this->embedModel->getMaxInputTokenLength();
        if ($embedLimit) {
            $candidates[] = (int)floor($embedLimit * 0.9); // only use 90% of the embedding model to be safe
        }

        return min($candidates);
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search indexer. Pages that do not qualify are dropped
     * from the storage, unchanged pages reuse their existing chunks and all others are re-chunked.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // NOTE(review): $skipRE is matched against the plain page ID while $matchRE is matched
            // against the ID with a leading colon - kept as is, existing configurations rely on it
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->dropPageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason
     *
     * Note: this sets the global $ID to the given page and does not restore the previous value.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page content is obtained via getPageContent(). Each chunk is prefixed with the
     * breadcrumb trail of the page. Chunks for which no embedding could be fetched are skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // default action allowed: prepend whatever the plugins added to the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default action prevented: use only what the plugins provided
            $text = $eventData['body'];
        }
        // NOTE(review): advise_after() is not called for this event - confirm this is intended

        $splitter = new TextSplitter($this->getChunkSize(), $this->getTokenEncoder());
        $parts = $splitter->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++; // IDs are only consumed by chunks that were actually created
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks scoring below the similarity threshold are dropped. The number of returned chunks is
     * capped by the configured number of context chunks and, if $limits is set, by the maximum
     * input token length of the chat model.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        // how many chunks to request from the storage, always a whole number
        $fetch = (int)$this->configContextChunks;
        if ($tokenlimit) {
            $maxChunkSize = $this->getChunkSize();
            if ($maxChunkSize > 0) {
                $fetch = min((int)floor($tokenlimit / $maxChunkSize), $fetch);
            }
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0; // only measured when a token limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits. Each page is returned only once, using the data of the
     * first chunk found for it. The returned array is keyed by page name.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($page);
            $crumbs = $this->breadcrumbTrail($page);

            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested.
     * Returns an empty list if the user may not read the page or the page is not in the index.
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'User not allowed to read context page {page}',
                    ['page' => $page]
                );
            }
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: numeric looking page IDs like "01" and "1" must not match each other
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'Context page {page} is not in index',
                    ['page' => $page]
                );
            }
            return [];
        }

        // chunk IDs start at page ID * 100, see createNewIndex()
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0; // only measured when a token limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }

    /**
     * Drop the chunks for the given page from the storage and delete the render cache file
     *
     * This is a performance optimization, it only deletes chunks when a previously rendered cache
     * file exists or if forced.
     *
     * @param string $page
     * @param int $chunkID
     * @param bool $force Should we force deletion even if no cache file exists?
     * @return void
     */
    public function dropPageChunks($page, $chunkID, $force = false)
    {
        $cache = new CacheRenderer($page, wikiFN($page), 'aichat');
        if (!$force && !file_exists($cache->cache)) return;

        if ($this->logger instanceof CLI) {
            $this->logger->info("Deleting chunks for page $page");
        }

        $this->storage->deletePageChunks($page, $chunkID);
        @unlink($cache->cache); // the file may be missing when deletion was forced
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // NOTE(review): for a page without namespace getNS() presumably yields an empty value, so
        // the loop below runs once with an empty namespace name - confirm this is intended
        $namespaces = explode(':', getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }
}