<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks (see getChunkSize() for how the size is determined). For each chunk
 * the embedding vector is fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks to use as context */
    protected $configContextChunks;

    /** @var float minimum similarity score (0..1) a chunk needs to be used */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration (chunkSize, contextChunks, similarityThreshold in percent)
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100; // percent -> fraction
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not positive
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created lazily on first use.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of the configured chunk size, a quarter of the chat model's input limit and
     * 90% of the embedding model's input limit. A model reporting a falsy/non-positive limit is treated
     * as having no limit and does not constrain the result.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $configured = (int)$this->configChunkSize;

        $chatLimit = $this->chatModel->getMaxInputTokenLength();
        if (!$chatLimit) {
            // no token limit, use the configured chunk size
            return $configured;
        }

        $candidates = [
            (int)floor($chatLimit / 4), // be able to fit 4 chunks into the max input
            $configured, // this is usually the smallest
        ];

        $embedLimit = $this->embedModel->getMaxInputTokenLength();
        if ($embedLimit > 0) {
            // only use 90% of the embedding model to be safe
            $candidates[] = (int)floor($embedLimit * 0.9);
        }

        return min($candidates);
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search indexer. Pages that no longer qualify have their chunks removed,
     * pages that are older than their stored chunks are reused, everything else is (re)embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason.
     *
     * The global $ID is pointed at the page while rendering and restored afterwards, so calling
     * this during a normal request does not leak a foreign page context.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $keepID = $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        } finally {
            $ID = $keepID;
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page content is obtained via getPageContent(). Each chunk is prefixed with the page's
     * breadcrumb trail. Chunks for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // plugin supplied text is prepended to the page content
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default was prevented: only the plugin supplied text is used
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is bound by the configured number of context chunks and,
     * when $limits is set, by the chat model's maximum input token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;
        $maxChunks = (int)$this->configContextChunks;

        // how many chunks to request from the storage; always a whole number
        $fetch = $maxChunks;
        if ($tokenlimit) {
            $maxChunkTokens = $this->getChunkSize();
            if ($maxChunkTokens > 0) {
                $fetch = min(intdiv((int)$tokenlimit, $maxChunkTokens), $maxChunks);
            }
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkTokens = 0;
            if ($tokenlimit) {
                $chunkTokens = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkTokens > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkTokens;

            if (count($result) >= $maxChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits. Each page is returned only once, represented by a Chunk
     * that carries the metadata of the first matching chunk but the text of the whole page.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[] indexed by page ID
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($page);
            $crumbs = $this->breadcrumbTrail($page);

            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[] empty when the page is not readable by the current user or not in the index
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'User not allowed to read context page {page}',
                    ['page' => $page]
                );
            }
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: page IDs that look numeric must not match loosely ("0123" vs "123")
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'Context page {page} is not in index',
                    ['page' => $page]
                );
            }
            return [];
        }

        // chunk IDs start at page ID * 100, see createNewIndex()
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkTokens = 0;
            if ($tokenlimit) {
                $chunkTokens = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkTokens > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkTokens;
        }

        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context. Pages in the root namespace get a trail consisting
     * of the page itself only.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() returns false for pages without a namespace; don't turn that into an empty crumb
        $ns = getNS($id);
        $namespaces = ($ns === false || $ns === '') ? [] : explode(':', $ns);

        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks that fit the chunk size
     *
     * The text is split into sentences which are accumulated until the chunk size would be exceeded.
     * Each following chunk starts with the most recent sentences of the previous one (at most
     * MAX_OVERLAP_LEN tokens) unless that overlap would push the new chunk over the chunk size.
     * Sentences longer than the chunk size are skipped with a warning.
     *
     * @param string $text
     * @return string[] non-empty, trimmed chunks
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // invariant for the whole run

        // the overlap must never leak in from a previously processed text
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';

        // iterate with foreach: a falsy sentence such as "0" must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue) . $sentence;
                $chunklen = count($tiktok->encode($chunk));
                if ($chunklen > $maxLen) {
                    // the overlap would exceed the chunk size, start with the sentence alone
                    $chunk = $sentence;
                    $chunklen = $slen;
                }
            }

            // remember sentence for the overlap of the next chunk
            $this->rememberSentence($sentence);
        }

        // add the remainder
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content fits into MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}