<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks (see getChunkSize() for the size that is used). For each chunk the
 * embedding vector is fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder lazily created, see getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured chunk size in tokens ('chunkSize' config key) */
    protected $configChunkSize;

    /** @var int the maximum number of chunks to return ('contextChunks' config key) */
    protected $configContextChunks;

    /** @var float minimum score a chunk needs, 0..1 ('similarityThreshold' config key divided by 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration; the keys chunkSize, contextChunks and
     *                      similarityThreshold (given in percent) are read
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not greater than 0
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A value between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * When the chat model reports no input token limit, the configured chunk size is used as is.
     * Otherwise the smallest of a quarter of the chat model limit, 90% of the embedding model limit
     * and the configured chunk size is used.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
        if (!$tokenlimit) {
            // no token limit, use the configured chunk size
            return (int)$this->configChunkSize;
        }

        // floor() returns floats, cast so the documented int is actually returned
        return (int)min(
            // be able to fit 4 chunks into the max input
            floor($tokenlimit / 4),
            // only use 90% of the embedding model to be safe
            floor($this->embedModel->getMaxInputTokenLength() * 0.9),
            // this is usually the smallest
            $this->configChunkSize,
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search indexer. Pages that do not exist, are hidden, are
     * smaller than 150 bytes or are excluded by the given expressions get their chunks removed.
     * For all other pages the existing chunks are reused when the page file is older than the first
     * stored chunk, otherwise new chunks are created.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters),
     *                       matched against the page ID
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters),
     *                        matched against the page ID prefixed with a colon
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page is rendered in 'aichat' mode. If rendering fails, the raw wiki text is used instead.
     * Each chunk is prefixed with the breadcrumb trail of the page.
     *
     * Note: this sets the global $ID to the given page and does not restore it.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // default action allowed: plugin supplied body is prepended to the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default action prevented: only the plugin supplied body is used
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // a failed embedding only skips this chunk, the remaining chunks are still processed
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks scoring below the similarity threshold are dropped. At most the configured number of
     * context chunks is returned; when limits are applied the result is additionally cut off before
     * the chat model's maximum input token length would be exceeded.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        if ($tokenlimit) {
            // request a whole number of chunks; max() guards against a division by a zero chunk size
            $fetch = (int)min(
                floor($tokenlimit / max(1, $this->getChunkSize())),
                $this->configContextChunks
            );
        } else {
            $fetch = $this->configContextChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkTokens = 0; // only counted when a token limit applies
            if ($tokenlimit) {
                $chunkTokens = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkTokens > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkTokens;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested.
     * Returns an empty list when the current user may not read the page or the page is not
     * known to the search indexer.
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'User not allowed to read context page {page}',
                    ['page' => $page]
                );
            }
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: loosely compared numeric-looking IDs (eg. "1e3" and "1000") would match
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'Context page {page} is not in index',
                    ['page' => $page]
                );
            }
            return [];
        }

        // chunk IDs start at page ID * 100, see createNewIndex()
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkTokens = 0; // only counted when a token limit applies
            if ($tokenlimit) {
                $chunkTokens = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkTokens > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkTokens;
        }

        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        $namespaces = explode(':', getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of at most roughly getChunkSize() tokens
     *
     * The text is split into sentences which are collected into chunks. A new chunk starts with the
     * most recently remembered sentences (up to MAX_OVERLAP_LEN tokens) so that consecutive chunks
     * overlap. Sentences that are longer than the chunk size on their own are skipped.
     *
     * @param string $text
     * @return array The chunks as strings, the last one may be empty
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting, look it up once

        // each text starts with an empty overlap, otherwise sentences remembered from a previously
        // split text could end up in the chunks of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a falsy sentence like "0" must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence is part of the new chunk, so it belongs into later overlaps as well
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content is no longer than
     * MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}