chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max Maximum number of chunks to use as context, must be positive
     * @return void
     * @throws \InvalidArgumentException when $max is zero or negative
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) {
            throw new \InvalidArgumentException('max context chunks must be greater than 0');
        }
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * Unlike the configuration value (which is divided by 100 in the constructor), the
     * threshold is given here as a fraction.
     *
     * @param float $threshold Minimum score a chunk needs, between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) {
            throw new \InvalidArgumentException('threshold must be between 0 and 1');
        }
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger The CLI instance used for info/warning/error/success output
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and then reused.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The result is the smallest of: a quarter of the chat model's input limit, 90% of the
     * embedding model's input limit and the configured chunk size. When the chat model reports
     * no limit, the configured chunk size is used as is.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $configured = (int)$this->configChunkSize;

        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
        if (!$tokenlimit) {
            // no token limit, use the configured chunk size
            return $configured;
        }

        $candidates = [
            floor($tokenlimit / 4), // be able to fit 4 chunks into the max input
            $configured, // this is usually the smallest
        ];

        // A non-positive embedding limit is treated as "no limit", mirroring how the chat
        // model's limit is handled above; otherwise the chunk size would collapse to 0.
        $embedLimit = $this->embedModel->getMaxInputTokenLength();
        if ($embedLimit > 0) {
            $candidates[] = floor($embedLimit * 0.9); // only use 90% of the embedding model to be safe
        }

        // floor() returns floats, but callers are promised an integer
        return (int)min($candidates);
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the indexer. Pages that are missing, hidden, very small or
     * excluded by the given expressions have their chunks removed. For all other pages the
     * existing chunks are reused when they are newer than the page file, otherwise they are
     * recreated.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included
     *                        (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) {
                    $this->logger->info("Reusing chunks for $page");
                }
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) {
                    $this->storage->addPageChunks($chunks);
                }
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason
     *
     * Note: this overwrites the global $ID with the given page and does not restore it.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $ID = $page;

        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The text is obtained via getPageContent(). Each resulting chunk is prefixed with the
     * page's breadcrumb trail before its embedding is requested. Parts for which no embedding
     * could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // default action allowed: plugin supplied body is put in front of the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default action prevented: only the plugin supplied body is used
            $text = $eventData['body'];
        }

        $splitter = new TextSplitter($this->getChunkSize(), $this->getTokenEncoder());
        $parts = $splitter->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }

            // IDs are only consumed by chunks that were actually created
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }

        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks scoring below the similarity threshold are dropped. The number of returned chunks
     * is bounded by the configured number of context chunks and, when limits are applied,
     * by the chat model's input token limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        if ($tokenlimit) {
            // The storage is asked for a whole number of chunks; max() guards the division
            // against a chunk size of zero.
            $fetch = (int)min(
                floor($tokenlimit / max(1, $this->getChunkSize())),
                $this->configContextChunks
            );
        } else {
            $fetch = $this->configContextChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0; // stays 0 when no token limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits. Each page appears only once; the returned Chunk
     * carries the ID, embedding, language, creation time and score of the first chunk found for
     * that page, with the breadcrumb trail and full page content as its text. The result is keyed
     * by page name.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($chunk->getPage());
            $crumbs = $this->breadcrumbTrail($chunk->getPage());

            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested. Returns an
     * empty list when the current user may not read the page or the page is not in the index.
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'User not allowed to read context page {page}',
                    ['page' => $page]
                );
            }
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: loosely compared numeric looking IDs ("0123" vs "123") would match
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'Context page {page} is not in index',
                    ['page' => $page]
                );
            }
            return [];
        }

        // chunk IDs start at page ID * 100, see createNewIndex()
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0; // stays 0 when no token limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() result is cast explicitly; a non-string result becomes a single empty segment
        $namespaces = explode(':', (string)getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }
}