xref: /plugin/aichat/Embeddings.php (revision 2d02fff5bb8245df792cb940eaffb67df2e9387c)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
5*2d02fff5SAndreas Gohruse dokuwiki\Cache\CacheRenderer;
6ab1f8ddeSAndreas Gohruse dokuwiki\Extension\Event;
7661701eeSAndreas Gohruse dokuwiki\File\PageResolver;
8294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\ChatInterface;
9294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\EmbeddingInterface;
10f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Storage\AbstractStorage;
118817535bSAndreas Gohruse dokuwiki\Search\Indexer;
122ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
138817535bSAndreas Gohruse TikToken\Encoder;
148817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks (see getChunkSize() for the size that is used). For each chunk an
 * embedding vector is fetched from the configured embedding model and stored in the storage backend.
 */
class Embeddings
{
    /** @var ChatInterface the chat model, used here for its input token limit */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger, most methods work silently when none is set */
    protected $logger;

    /** @var Encoder lazily created, see getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the backend that stores and searches the chunks */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var int the time spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured (maximum) chunk size in tokens */
    protected $configChunkSize;

    /** @var int the maximum number of chunks to return as context */
    protected $configContextChunks;

    /** @var float minimum score (0..1) a chunk needs to be considered similar */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration. The keys 'chunkSize', 'contextChunks' and
     *                      'similarityThreshold' (given in percent) are read.
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the configuration holds a percentage, internally a 0..1 fraction is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not greater than 0
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A fraction between 0 and 1 (not a percentage)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * Without a chat model token limit the configured chunk size is used as is. Otherwise the
     * smallest of the configured size, a quarter of the chat model's input limit and 90% of the
     * embedding model's input limit is returned.
     *
     * @return int
     */
    public function getChunkSize()
    {
        $chatLimit = $this->chatModel->getMaxInputTokenLength();
        if (!$chatLimit) {
            // no token limit, use the configured chunk size
            return (int) $this->configChunkSize;
        }

        $candidates = [
            floor($chatLimit / 4), // be able to fit 4 chunks into the max input
            $this->configChunkSize, // this is usually the smallest
        ];

        // only use 90% of the embedding model to be safe
        // A falsy limit is skipped so it can not force the chunk size down to 0.
        // NOTE(review): assumes 0 means "no limit" here as it does for the chat model - confirm
        $embedLimit = $this->embedModel->getMaxInputTokenLength();
        if ($embedLimit) {
            $candidates[] = floor($embedLimit * 0.9);
        }

        // floor() returns floats, callers are promised an integer
        return (int) min($candidates);
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search index. Pages that do not qualify get their chunks
     * dropped, pages whose source file is older than their first stored chunk are reused and all
     * other pages are split and embedded again.
     *
     * Note that $skipRE is matched against the plain page ID while $matchRE is matched against the
     * page ID with a leading colon.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // the order matters: the file size may only be checked for existing pages
            $excluded = !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"));

            if ($excluded) {
                // this page should not be in the index (anymore)
                $this->dropPageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason
     *
     * Side effect: the global $ID is set to the given page and is not restored afterwards.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            // rendering is best effort only, never let a broken page abort the caller
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The text is obtained through getPageContent(). Each chunk is prefixed with the
     * breadcrumb trail of the page. Chunks for which no embedding could be fetched are logged
     * and left out.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        // NOTE(review): only advise_before() is triggered, AFTER handlers are never called
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // default action allowed: plugin data is put in front of the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default action prevented: only the plugin provided body is used
            $text = $eventData['body'];
        }

        $splitter = new TextSplitter($this->getChunkSize(), $this->getTokenEncoder());
        $parts = $splitter->splitIntoChunks($text);

        // NOTE(review): pages get ID ranges of 100 (see createNewIndex), nothing here prevents
        // a page with more than 100 chunks from running into the next range - confirm
        $chunkID = $firstChunkID;
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue; // the ID is not consumed by a failed chunk
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++;
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the configured number of context chunks and,
     * if requested, by the chat model's input token limit. Chunks scoring below the similarity
     * threshold are dropped.
     *
     * The time spent in the storage backend is made available in $timeSpent.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $fetch = $this->configContextChunks;
        if ($tokenlimit) {
            // never ask for more chunks than would fit into the token limit. The storage is
            // handed a whole number and a chunk size of 0 must not cause a division by zero
            $perChunk = max(1, $this->getChunkSize());
            $fetch = min((int) floor($tokenlimit / $perChunk), $fetch);
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            // tokens are only counted when a limit has to be enforced
            $chunkTokens = 0;
            if ($tokenlimit) {
                $chunkTokens = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkTokens > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkTokens;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits. Each page is returned only once, using the data of
     * the first of its chunks in the result. The returned array is keyed by page name.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($page);
            $crumbs = $this->breadcrumbTrail($page);

            // same chunk data, but the text is replaced by the full page
            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested. An empty list is
     * returned when the current user may not read the page or the page is not in the search index.
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'User not allowed to read context page {page}', ['page' => $page]
            );
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: numeric looking page IDs must not match each other loosely
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'Context page {page} is not in index', ['page' => $page]
            );
            return [];
        }

        // chunk IDs start at page ID * 100, see createNewIndex()
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // tokens are only counted when a limit has to be enforced
            $chunkTokens = 0;
            if ($tokenlimit) {
                $chunkTokens = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkTokens > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkTokens;
        }

        return $result;
    }

    /**
     * Drop the chunks for the given page from the storage and delete the render cache file
     *
     * This is a performance optimization, it only deletes chunks when a previously rendered cache
     * file exists or if forced.
     *
     * @param string $page
     * @param int $chunkID The ID of the first chunk of this page
     * @param bool $force Should we force deletion even if no cache file exists?
     * @return void
     */
    public function dropPageChunks($page, $chunkID, $force = false)
    {
        $cache = new CacheRenderer($page, wikiFN($page), 'aichat');
        if (!$force && !file_exists($cache->cache)) {
            return; // nothing was rendered for this page, nothing to clean up
        }

        if ($this->logger instanceof CLI) {
            $this->logger->info("Deleting chunks for page $page");
        }

        $this->storage->deletePageChunks($page, $chunkID);
        @unlink($cache->cache); // the file may be missing when deletion was forced
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context.
     *
     * @param string $id
     * @return string The crumbs, joined by ' » '
     */
    protected function breadcrumbTrail($id)
    {
        // NOTE(review): for a page without a namespace this presumably results in a single
        // empty namespace entry and thus an empty or "Title ()" crumb - confirm this is intended
        $namespaces = explode(':', getNS($id));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':'; // grows into the full namespace, level by level
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }
}
482