xref: /plugin/aichat/Embeddings.php (revision 9634d7345e88e8177bbba1e0ecb312352866df1d)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
5ab1f8ddeSAndreas Gohruse dokuwiki\Extension\Event;
6661701eeSAndreas Gohruse dokuwiki\File\PageResolver;
7294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\ChatInterface;
8294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Storage\AbstractStorage;
108817535bSAndreas Gohruse dokuwiki\Search\Indexer;
112ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
128817535bSAndreas Gohruse TikToken\Encoder;
138817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
148817535bSAndreas Gohr
159da5f0dfSAndreas Gohr/**
169da5f0dfSAndreas Gohr * Manage the embeddings index
179da5f0dfSAndreas Gohr *
 * Pages are split into chunks whose token size is derived from the configuration and the model limits.
 * For each chunk the embedding vector is fetched from the embedding model and stored in the Storage backend.
209da5f0dfSAndreas Gohr */
218817535bSAndreas Gohrclass Embeddings
228817535bSAndreas Gohr{
2368908844SAndreas Gohr    /** @var int maximum overlap between chunks in tokens */
2430b9cbc7Ssplitbrain    final public const MAX_OVERLAP_LEN = 200;
258817535bSAndreas Gohr
26294a9eafSAndreas Gohr    /** @var ChatInterface */
276a18e0f4SAndreas Gohr    protected $chatModel;
286a18e0f4SAndreas Gohr
29294a9eafSAndreas Gohr    /** @var EmbeddingInterface */
306a18e0f4SAndreas Gohr    protected $embedModel;
316a18e0f4SAndreas Gohr
322ecc089aSAndreas Gohr    /** @var CLI|null */
332ecc089aSAndreas Gohr    protected $logger;
3468908844SAndreas Gohr    /** @var Encoder */
3568908844SAndreas Gohr    protected $tokenEncoder;
368817535bSAndreas Gohr
377ee8b02dSAndreas Gohr    /** @var AbstractStorage */
387ee8b02dSAndreas Gohr    protected $storage;
397ee8b02dSAndreas Gohr
4068908844SAndreas Gohr    /** @var array remember sentences when chunking */
4168908844SAndreas Gohr    private $sentenceQueue = [];
4268908844SAndreas Gohr
43c2b7a1f7SAndreas Gohr    /** @var int the time spent for the last similar chunk retrieval */
44c2b7a1f7SAndreas Gohr    public $timeSpent = 0;
45c2b7a1f7SAndreas Gohr
4634a1c478SAndreas Gohr    protected $configChunkSize;
4734a1c478SAndreas Gohr    protected $configContextChunks;
48720bb43fSAndreas Gohr    protected $similarityThreshold;
4934a1c478SAndreas Gohr
5034a1c478SAndreas Gohr    /**
5134a1c478SAndreas Gohr     * Embeddings constructor.
5234a1c478SAndreas Gohr     *
5334a1c478SAndreas Gohr     * @param ChatInterface $chatModel
5434a1c478SAndreas Gohr     * @param EmbeddingInterface $embedModel
5534a1c478SAndreas Gohr     * @param AbstractStorage $storage
5634a1c478SAndreas Gohr     * @param array $config The plugin configuration
5734a1c478SAndreas Gohr     */
586a18e0f4SAndreas Gohr    public function __construct(
59294a9eafSAndreas Gohr        ChatInterface      $chatModel,
60294a9eafSAndreas Gohr        EmbeddingInterface $embedModel,
6134a1c478SAndreas Gohr        AbstractStorage    $storage,
6234a1c478SAndreas Gohr                           $config
63aa6bbe75SAndreas Gohr    )
64aa6bbe75SAndreas Gohr    {
656a18e0f4SAndreas Gohr        $this->chatModel = $chatModel;
666a18e0f4SAndreas Gohr        $this->embedModel = $embedModel;
67f6ef2e50SAndreas Gohr        $this->storage = $storage;
6834a1c478SAndreas Gohr        $this->configChunkSize = $config['chunkSize'];
6934a1c478SAndreas Gohr        $this->configContextChunks = $config['contextChunks'];
70720bb43fSAndreas Gohr        $this->similarityThreshold = $config['similarityThreshold'] / 100;
717ee8b02dSAndreas Gohr    }
727ee8b02dSAndreas Gohr
737ee8b02dSAndreas Gohr    /**
747ee8b02dSAndreas Gohr     * Access storage
757ee8b02dSAndreas Gohr     *
767ee8b02dSAndreas Gohr     * @return AbstractStorage
777ee8b02dSAndreas Gohr     */
787ee8b02dSAndreas Gohr    public function getStorage()
797ee8b02dSAndreas Gohr    {
807ee8b02dSAndreas Gohr        return $this->storage;
812ecc089aSAndreas Gohr    }
822ecc089aSAndreas Gohr
832ecc089aSAndreas Gohr    /**
84aa6bbe75SAndreas Gohr     * Override the number of used context chunks
85aa6bbe75SAndreas Gohr     *
86aa6bbe75SAndreas Gohr     * @param int $max
87aa6bbe75SAndreas Gohr     * @return void
88aa6bbe75SAndreas Gohr     */
89aa6bbe75SAndreas Gohr    public function setConfigContextChunks(int $max)
90aa6bbe75SAndreas Gohr    {
91aa6bbe75SAndreas Gohr        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
92aa6bbe75SAndreas Gohr        $this->configContextChunks = $max;
93aa6bbe75SAndreas Gohr    }
94aa6bbe75SAndreas Gohr
95aa6bbe75SAndreas Gohr    /**
96aa6bbe75SAndreas Gohr     * Override the similiarity threshold
97aa6bbe75SAndreas Gohr     *
98aa6bbe75SAndreas Gohr     * @param float $threshold
99aa6bbe75SAndreas Gohr     * @return void
100aa6bbe75SAndreas Gohr     */
101aa6bbe75SAndreas Gohr    public function setSimilarityThreshold(float $threshold)
102aa6bbe75SAndreas Gohr    {
103aa6bbe75SAndreas Gohr        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
104aa6bbe75SAndreas Gohr        $this->similarityThreshold = $threshold;
105aa6bbe75SAndreas Gohr    }
106aa6bbe75SAndreas Gohr
107aa6bbe75SAndreas Gohr    /**
1082ecc089aSAndreas Gohr     * Add a logger instance
1092ecc089aSAndreas Gohr     *
1102ecc089aSAndreas Gohr     * @return void
1112ecc089aSAndreas Gohr     */
1122ecc089aSAndreas Gohr    public function setLogger(CLI $logger)
1132ecc089aSAndreas Gohr    {
1148817535bSAndreas Gohr        $this->logger = $logger;
1158817535bSAndreas Gohr    }
1168817535bSAndreas Gohr
1172ecc089aSAndreas Gohr    /**
11868908844SAndreas Gohr     * Get the token encoder instance
11968908844SAndreas Gohr     *
12068908844SAndreas Gohr     * @return Encoder
12168908844SAndreas Gohr     */
12268908844SAndreas Gohr    public function getTokenEncoder()
12368908844SAndreas Gohr    {
1247ebc7895Ssplitbrain        if (!$this->tokenEncoder instanceof Encoder) {
12568908844SAndreas Gohr            $this->tokenEncoder = new Encoder();
12668908844SAndreas Gohr        }
12768908844SAndreas Gohr        return $this->tokenEncoder;
12868908844SAndreas Gohr    }
12968908844SAndreas Gohr
13068908844SAndreas Gohr    /**
1316a18e0f4SAndreas Gohr     * Return the chunk size to use
1326a18e0f4SAndreas Gohr     *
1336a18e0f4SAndreas Gohr     * @return int
1346a18e0f4SAndreas Gohr     */
1356a18e0f4SAndreas Gohr    public function getChunkSize()
1366a18e0f4SAndreas Gohr    {
1377be8078eSAndreas Gohr        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
1387be8078eSAndreas Gohr        if (!$tokenlimit) {
1397be8078eSAndreas Gohr            // no token limit, use the configured chunk size
1407be8078eSAndreas Gohr            return $this->configChunkSize;
1417be8078eSAndreas Gohr        }
1427be8078eSAndreas Gohr
1436a18e0f4SAndreas Gohr        return min(
14434a1c478SAndreas Gohr            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
14534a1c478SAndreas Gohr            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
14634a1c478SAndreas Gohr            $this->configChunkSize, // this is usually the smallest
1476a18e0f4SAndreas Gohr        );
1486a18e0f4SAndreas Gohr    }
1496a18e0f4SAndreas Gohr
1506a18e0f4SAndreas Gohr    /**
1515284515dSAndreas Gohr     * Update the embeddings storage
1522ecc089aSAndreas Gohr     *
153ad38c5fdSAndreas Gohr     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
154d5c102b3SAndreas Gohr     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
1555284515dSAndreas Gohr     * @param bool $clear Should any existing storage be cleared before updating?
1562ecc089aSAndreas Gohr     * @return void
1575284515dSAndreas Gohr     * @throws \Exception
1582ecc089aSAndreas Gohr     */
159d5c102b3SAndreas Gohr    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
1608817535bSAndreas Gohr    {
1618817535bSAndreas Gohr        $indexer = new Indexer();
1628817535bSAndreas Gohr        $pages = $indexer->getPages();
1638817535bSAndreas Gohr
164f6ef2e50SAndreas Gohr        $this->storage->startCreation($clear);
1655aa45b4dSAndreas Gohr        foreach ($pages as $pid => $page) {
1665aa45b4dSAndreas Gohr            $chunkID = $pid * 100; // chunk IDs start at page ID * 100
1675aa45b4dSAndreas Gohr
1685284515dSAndreas Gohr            if (
1695284515dSAndreas Gohr                !page_exists($page) ||
1705284515dSAndreas Gohr                isHiddenPage($page) ||
1714e206c13SAndreas Gohr                filesize(wikiFN($page)) < 150 || // skip very small pages
172d5c102b3SAndreas Gohr                ($skipRE && preg_match($skipRE, (string)$page)) ||
173d5c102b3SAndreas Gohr                ($matchRE && !preg_match($matchRE, ":$page"))
1745284515dSAndreas Gohr            ) {
1755284515dSAndreas Gohr                // this page should not be in the index (anymore)
1765284515dSAndreas Gohr                $this->storage->deletePageChunks($page, $chunkID);
1775284515dSAndreas Gohr                continue;
1785284515dSAndreas Gohr            }
1795284515dSAndreas Gohr
1807ee8b02dSAndreas Gohr            $firstChunk = $this->storage->getChunk($chunkID);
1817ee8b02dSAndreas Gohr            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
1825aa45b4dSAndreas Gohr                // page is older than the chunks we have, reuse the existing chunks
1837ee8b02dSAndreas Gohr                $this->storage->reusePageChunks($page, $chunkID);
1847ebc7895Ssplitbrain                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
1855aa45b4dSAndreas Gohr            } else {
1865aa45b4dSAndreas Gohr                // page is newer than the chunks we have, create new chunks
1877ee8b02dSAndreas Gohr                $this->storage->deletePageChunks($page, $chunkID);
188ecb0a423SAndreas Gohr                $chunks = $this->createPageChunks($page, $chunkID);
189ecb0a423SAndreas Gohr                if ($chunks) $this->storage->addPageChunks($chunks);
1905aa45b4dSAndreas Gohr            }
1915aa45b4dSAndreas Gohr        }
1927ee8b02dSAndreas Gohr        $this->storage->finalizeCreation();
1935aa45b4dSAndreas Gohr    }
1945aa45b4dSAndreas Gohr
1955aa45b4dSAndreas Gohr    /**
196*9634d734SAndreas Gohr     * Get the content of a page
197*9634d734SAndreas Gohr     *
198*9634d734SAndreas Gohr     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
199*9634d734SAndreas Gohr     * raw syntax if the renderer fails for some reason
200*9634d734SAndreas Gohr     *
201*9634d734SAndreas Gohr     * @param string $page Name of the page to read
202*9634d734SAndreas Gohr     * @return string The content of the page
203*9634d734SAndreas Gohr     */
204*9634d734SAndreas Gohr    public function getPageContent($page)
205*9634d734SAndreas Gohr    {
206*9634d734SAndreas Gohr        global $ID;
207*9634d734SAndreas Gohr        $ID = $page;
208*9634d734SAndreas Gohr        try {
209*9634d734SAndreas Gohr            $text = p_cached_output(wikiFN($page), 'aichat', $page);
210*9634d734SAndreas Gohr        } catch (\Throwable $e) {
211*9634d734SAndreas Gohr            if ($this->logger) $this->logger->error(
212*9634d734SAndreas Gohr                'Failed to render page {page}. Using raw text instead. {msg}',
213*9634d734SAndreas Gohr                ['page' => $page, 'msg' => $e->getMessage()]
214*9634d734SAndreas Gohr            );
215*9634d734SAndreas Gohr            $text = rawWiki($page);
216*9634d734SAndreas Gohr        }
217*9634d734SAndreas Gohr        return $text;
218*9634d734SAndreas Gohr    }
219*9634d734SAndreas Gohr
220*9634d734SAndreas Gohr    /**
2217ee8b02dSAndreas Gohr     * Split the given page, fetch embedding vectors and return Chunks
2225aa45b4dSAndreas Gohr     *
22388305719SAndreas Gohr     * Will use the text renderer plugin if available to get the rendered text.
22488305719SAndreas Gohr     * Otherwise the raw wiki text is used.
22588305719SAndreas Gohr     *
2265aa45b4dSAndreas Gohr     * @param string $page Name of the page to split
2277ee8b02dSAndreas Gohr     * @param int $firstChunkID The ID of the first chunk of this page
2287ee8b02dSAndreas Gohr     * @return Chunk[] A list of chunks created for this page
229ab1f8ddeSAndreas Gohr     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
2305aa45b4dSAndreas Gohr     * @throws \Exception
2315aa45b4dSAndreas Gohr     */
232ab1f8ddeSAndreas Gohr    public function createPageChunks($page, $firstChunkID)
2335aa45b4dSAndreas Gohr    {
2347ee8b02dSAndreas Gohr        $chunkList = [];
23588305719SAndreas Gohr
236*9634d734SAndreas Gohr        $text = $this->getPageContent($page);
237661701eeSAndreas Gohr        $crumbs = $this->breadcrumbTrail($page);
23888305719SAndreas Gohr
239ab1f8ddeSAndreas Gohr        // allow plugins to modify the text before splitting
240ab1f8ddeSAndreas Gohr        $eventData = [
241ab1f8ddeSAndreas Gohr            'page' => $page,
242ab1f8ddeSAndreas Gohr            'body' => '',
243ab1f8ddeSAndreas Gohr            'metadata' => ['title' => $page, 'relation_references' => []],
244ab1f8ddeSAndreas Gohr        ];
245ab1f8ddeSAndreas Gohr        $event = new Event('INDEXER_PAGE_ADD', $eventData);
246ab1f8ddeSAndreas Gohr        if ($event->advise_before()) {
247ab1f8ddeSAndreas Gohr            $text = $eventData['body'] . ' ' . $text;
248ab1f8ddeSAndreas Gohr        } else {
249ab1f8ddeSAndreas Gohr            $text = $eventData['body'];
250ab1f8ddeSAndreas Gohr        }
251ab1f8ddeSAndreas Gohr
25288305719SAndreas Gohr        $parts = $this->splitIntoChunks($text);
2537ee8b02dSAndreas Gohr        foreach ($parts as $part) {
25430b9cbc7Ssplitbrain            if (trim((string)$part) == '') continue; // skip empty chunks
25593c1dbf4SAndreas Gohr
256661701eeSAndreas Gohr            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk
257661701eeSAndreas Gohr
258ad38c5fdSAndreas Gohr            try {
2596a18e0f4SAndreas Gohr                $embedding = $this->embedModel->getEmbedding($part);
260ad38c5fdSAndreas Gohr            } catch (\Exception $e) {
2617ebc7895Ssplitbrain                if ($this->logger instanceof CLI) {
262ad38c5fdSAndreas Gohr                    $this->logger->error(
263ad38c5fdSAndreas Gohr                        'Failed to get embedding for chunk of page {page}: {msg}',
264ad38c5fdSAndreas Gohr                        ['page' => $page, 'msg' => $e->getMessage()]
265ad38c5fdSAndreas Gohr                    );
266ad38c5fdSAndreas Gohr                }
267ad38c5fdSAndreas Gohr                continue;
268ad38c5fdSAndreas Gohr            }
2697ee8b02dSAndreas Gohr            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
2707ee8b02dSAndreas Gohr            $firstChunkID++;
2718817535bSAndreas Gohr        }
2727ebc7895Ssplitbrain        if ($this->logger instanceof CLI) {
2737ebc7895Ssplitbrain            if ($chunkList !== []) {
274f8d5ae01SAndreas Gohr                $this->logger->success(
275f8d5ae01SAndreas Gohr                    '{id} split into {count} chunks',
276f8d5ae01SAndreas Gohr                    ['id' => $page, 'count' => count($chunkList)]
277f8d5ae01SAndreas Gohr                );
27893c1dbf4SAndreas Gohr            } else {
27993c1dbf4SAndreas Gohr                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
28093c1dbf4SAndreas Gohr            }
2818817535bSAndreas Gohr        }
2827ee8b02dSAndreas Gohr        return $chunkList;
2838817535bSAndreas Gohr    }
2848817535bSAndreas Gohr
2859e81bea7SAndreas Gohr    /**
2869e81bea7SAndreas Gohr     * Do a nearest neighbor search for chunks similar to the given question
2879e81bea7SAndreas Gohr     *
2889e81bea7SAndreas Gohr     * Returns only chunks the current user is allowed to read, may return an empty result.
28968908844SAndreas Gohr     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
2909e81bea7SAndreas Gohr     *
2919e81bea7SAndreas Gohr     * @param string $query The question
292e33a1d7aSAndreas Gohr     * @param string $lang Limit results to this language
293aa6bbe75SAndreas Gohr     * @param bool $limits Apply chat token limits to the number of chunks returned?
2947ee8b02dSAndreas Gohr     * @return Chunk[]
2959e81bea7SAndreas Gohr     * @throws \Exception
2969e81bea7SAndreas Gohr     */
297aa6bbe75SAndreas Gohr    public function getSimilarChunks($query, $lang = '', $limits = true)
2988817535bSAndreas Gohr    {
2999e81bea7SAndreas Gohr        global $auth;
3006a18e0f4SAndreas Gohr        $vector = $this->embedModel->getEmbedding($query);
3018817535bSAndreas Gohr
3027be8078eSAndreas Gohr        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;
3037be8078eSAndreas Gohr
3047be8078eSAndreas Gohr        if ($tokenlimit) {
305e3640be8SAndreas Gohr            $fetch = min(
3067be8078eSAndreas Gohr                ($tokenlimit / $this->getChunkSize()),
30734a1c478SAndreas Gohr                $this->configContextChunks
308f6ef2e50SAndreas Gohr            );
309aa6bbe75SAndreas Gohr        } else {
310aa6bbe75SAndreas Gohr            $fetch = $this->configContextChunks;
311aa6bbe75SAndreas Gohr        }
312aee9b383SAndreas Gohr
313aee9b383SAndreas Gohr        $time = microtime(true);
314e33a1d7aSAndreas Gohr        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
3155f71c9bbSAndreas Gohr        $this->timeSpent = round(microtime(true) - $time, 2);
3167ebc7895Ssplitbrain        if ($this->logger instanceof CLI) {
317aee9b383SAndreas Gohr            $this->logger->info(
318c2f55081SAndreas Gohr                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
319c2f55081SAndreas Gohr                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
320aee9b383SAndreas Gohr            );
321aee9b383SAndreas Gohr        }
32268908844SAndreas Gohr
32368908844SAndreas Gohr        $size = 0;
3248817535bSAndreas Gohr        $result = [];
3257ee8b02dSAndreas Gohr        foreach ($chunks as $chunk) {
3269e81bea7SAndreas Gohr            // filter out chunks the user is not allowed to read
3277ee8b02dSAndreas Gohr            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
328720bb43fSAndreas Gohr            if ($chunk->getScore() < $this->similarityThreshold) continue;
32968908844SAndreas Gohr
3307be8078eSAndreas Gohr            if ($tokenlimit) {
33168908844SAndreas Gohr                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
3327be8078eSAndreas Gohr                if ($size + $chunkSize > $tokenlimit) break; // we have enough
333aa6bbe75SAndreas Gohr            }
33468908844SAndreas Gohr
3359e81bea7SAndreas Gohr            $result[] = $chunk;
336aa6bbe75SAndreas Gohr            $size += $chunkSize ?? 0;
337aa6bbe75SAndreas Gohr
338aa6bbe75SAndreas Gohr            if (count($result) >= $this->configContextChunks) break; // we have enough
3398817535bSAndreas Gohr        }
3408817535bSAndreas Gohr        return $result;
3418817535bSAndreas Gohr    }
3428817535bSAndreas Gohr
343661701eeSAndreas Gohr    /**
344*9634d734SAndreas Gohr     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
345*9634d734SAndreas Gohr     *
346*9634d734SAndreas Gohr     * This will not apply any token limits
347*9634d734SAndreas Gohr     *
348*9634d734SAndreas Gohr     * @param string $query The question
349*9634d734SAndreas Gohr     * @param string $lang Limit results to this language
350*9634d734SAndreas Gohr     * @return Chunk[]
351*9634d734SAndreas Gohr     * @throws \Exception
352*9634d734SAndreas Gohr     */
353*9634d734SAndreas Gohr    public function getSimilarPages($query, $lang = '')
354*9634d734SAndreas Gohr    {
355*9634d734SAndreas Gohr        $chunks = $this->getSimilarChunks($query, $lang, false);
356*9634d734SAndreas Gohr        $pages = [];
357*9634d734SAndreas Gohr
358*9634d734SAndreas Gohr        foreach ($chunks as $chunk) {
359*9634d734SAndreas Gohr            $page = $chunk->getPage();
360*9634d734SAndreas Gohr            if (isset($pages[$page])) continue; // we already have this page
361*9634d734SAndreas Gohr
362*9634d734SAndreas Gohr            $content = $this->getPageContent($chunk->getPage());
363*9634d734SAndreas Gohr            $crumbs = $this->breadcrumbTrail($chunk->getPage());
364*9634d734SAndreas Gohr
365*9634d734SAndreas Gohr            $pages[$page] = new Chunk(
366*9634d734SAndreas Gohr                $page,
367*9634d734SAndreas Gohr                $chunk->getId(),
368*9634d734SAndreas Gohr                $crumbs . "\n\n" . $content,
369*9634d734SAndreas Gohr                $chunk->getEmbedding(),
370*9634d734SAndreas Gohr                $chunk->getLanguage(),
371*9634d734SAndreas Gohr                $chunk->getCreated(),
372*9634d734SAndreas Gohr                $chunk->getScore()
373*9634d734SAndreas Gohr            );
374*9634d734SAndreas Gohr        }
375*9634d734SAndreas Gohr        return $pages;
376*9634d734SAndreas Gohr    }
377*9634d734SAndreas Gohr
378*9634d734SAndreas Gohr    /**
379ed47fd87SAndreas Gohr     * Returns all chunks for a page
380ed47fd87SAndreas Gohr     *
381ed47fd87SAndreas Gohr     * Does not apply configContextChunks but checks token limits if requested
382ed47fd87SAndreas Gohr     *
383ed47fd87SAndreas Gohr     * @param string $page
384ed47fd87SAndreas Gohr     * @param bool $limits Apply chat token limits to the number of chunks returned?
385ed47fd87SAndreas Gohr     * @return Chunk[]
386ed47fd87SAndreas Gohr     */
387ed47fd87SAndreas Gohr    public function getPageChunks($page, $limits = true)
388ed47fd87SAndreas Gohr    {
389ed47fd87SAndreas Gohr        global $auth;
390ed47fd87SAndreas Gohr        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
391ed47fd87SAndreas Gohr            if ($this->logger instanceof CLI) $this->logger->warning(
392ed47fd87SAndreas Gohr                'User not allowed to read context page {page}', ['page' => $page]
393ed47fd87SAndreas Gohr            );
394ed47fd87SAndreas Gohr            return [];
395ed47fd87SAndreas Gohr        }
396ed47fd87SAndreas Gohr
397ed47fd87SAndreas Gohr        $indexer = new Indexer();
398ed47fd87SAndreas Gohr        $pages = $indexer->getPages();
399ed47fd87SAndreas Gohr        $pos = array_search(cleanID($page), $pages);
400ed47fd87SAndreas Gohr
401ed47fd87SAndreas Gohr        if ($pos === false) {
402ed47fd87SAndreas Gohr            if ($this->logger instanceof CLI) $this->logger->warning(
403ed47fd87SAndreas Gohr                'Context page {page} is not in index', ['page' => $page]
404ed47fd87SAndreas Gohr            );
405ed47fd87SAndreas Gohr            return [];
406ed47fd87SAndreas Gohr        }
407ed47fd87SAndreas Gohr
408ed47fd87SAndreas Gohr        $chunks = $this->storage->getPageChunks($page, $pos * 100);
409ed47fd87SAndreas Gohr
4107be8078eSAndreas Gohr        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;
4117be8078eSAndreas Gohr
412ed47fd87SAndreas Gohr        $size = 0;
413ed47fd87SAndreas Gohr        $result = [];
414ed47fd87SAndreas Gohr        foreach ($chunks as $chunk) {
4157be8078eSAndreas Gohr            if ($tokenlimit) {
416ed47fd87SAndreas Gohr                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
4177be8078eSAndreas Gohr                if ($size + $chunkSize > $tokenlimit) break; // we have enough
418ed47fd87SAndreas Gohr            }
419ed47fd87SAndreas Gohr
420ed47fd87SAndreas Gohr            $result[] = $chunk;
421ed47fd87SAndreas Gohr            $size += $chunkSize ?? 0;
422ed47fd87SAndreas Gohr        }
423ed47fd87SAndreas Gohr
424ed47fd87SAndreas Gohr        return $result;
425ed47fd87SAndreas Gohr    }
426ed47fd87SAndreas Gohr
427ed47fd87SAndreas Gohr
428ed47fd87SAndreas Gohr    /**
429661701eeSAndreas Gohr     * Create a breadcrumb trail for the given page
430661701eeSAndreas Gohr     *
431661701eeSAndreas Gohr     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
432661701eeSAndreas Gohr     * each chunk to give the AI some context.
433661701eeSAndreas Gohr     *
434661701eeSAndreas Gohr     * @param string $id
435661701eeSAndreas Gohr     * @return string
436661701eeSAndreas Gohr     */
437661701eeSAndreas Gohr    protected function breadcrumbTrail($id)
438661701eeSAndreas Gohr    {
439661701eeSAndreas Gohr        $namespaces = explode(':', getNS($id));
440661701eeSAndreas Gohr        $resolver = new PageResolver($id);
441661701eeSAndreas Gohr        $crumbs = [];
442661701eeSAndreas Gohr
443661701eeSAndreas Gohr        // all namespaces
444661701eeSAndreas Gohr        $check = '';
445661701eeSAndreas Gohr        foreach ($namespaces as $namespace) {
446661701eeSAndreas Gohr            $check .= $namespace . ':';
447661701eeSAndreas Gohr            $page = $resolver->resolveId($check);
448661701eeSAndreas Gohr            $title = p_get_first_heading($page);
449661701eeSAndreas Gohr            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
450661701eeSAndreas Gohr        }
451661701eeSAndreas Gohr
452661701eeSAndreas Gohr        // the page itself
453661701eeSAndreas Gohr        $title = p_get_first_heading($id);
454661701eeSAndreas Gohr        $page = noNS($id);
455661701eeSAndreas Gohr        $crumbs[] = $title ? "$title ($page)" : $page;
456661701eeSAndreas Gohr
457661701eeSAndreas Gohr        return implode(' » ', $crumbs);
458661701eeSAndreas Gohr    }
4595786be46SAndreas Gohr
4605786be46SAndreas Gohr    /**
4618817535bSAndreas Gohr     * @param $text
4628817535bSAndreas Gohr     * @return array
4638817535bSAndreas Gohr     * @throws \Exception
4648817535bSAndreas Gohr     * @todo support splitting too long sentences
4658817535bSAndreas Gohr     */
466ab1f8ddeSAndreas Gohr    protected function splitIntoChunks($text)
4678817535bSAndreas Gohr    {
4688817535bSAndreas Gohr        $sentenceSplitter = new Sentence();
46968908844SAndreas Gohr        $tiktok = $this->getTokenEncoder();
4708817535bSAndreas Gohr
4718817535bSAndreas Gohr        $chunks = [];
4728817535bSAndreas Gohr        $sentences = $sentenceSplitter->split($text);
4738817535bSAndreas Gohr
4748817535bSAndreas Gohr        $chunklen = 0;
4758817535bSAndreas Gohr        $chunk = '';
4768817535bSAndreas Gohr        while ($sentence = array_shift($sentences)) {
4778817535bSAndreas Gohr            $slen = count($tiktok->encode($sentence));
4786a18e0f4SAndreas Gohr            if ($slen > $this->getChunkSize()) {
4798817535bSAndreas Gohr                // sentence is too long, we need to split it further
480f8d5ae01SAndreas Gohr                if ($this->logger instanceof CLI) $this->logger->warning(
481f8d5ae01SAndreas Gohr                    'Sentence too long, splitting not implemented yet'
482f8d5ae01SAndreas Gohr                );
483ad38c5fdSAndreas Gohr                continue;
4848817535bSAndreas Gohr            }
4858817535bSAndreas Gohr
4866a18e0f4SAndreas Gohr            if ($chunklen + $slen < $this->getChunkSize()) {
4878817535bSAndreas Gohr                // add to current chunk
4888817535bSAndreas Gohr                $chunk .= $sentence;
4898817535bSAndreas Gohr                $chunklen += $slen;
49068908844SAndreas Gohr                // remember sentence for overlap check
49168908844SAndreas Gohr                $this->rememberSentence($sentence);
4928817535bSAndreas Gohr            } else {
49368908844SAndreas Gohr                // add current chunk to result
494ab1f8ddeSAndreas Gohr                $chunk = trim($chunk);
495ab1f8ddeSAndreas Gohr                if ($chunk !== '') $chunks[] = $chunk;
49668908844SAndreas Gohr
49768908844SAndreas Gohr                // start new chunk with remembered sentences
4987ebc7895Ssplitbrain                $chunk = implode(' ', $this->sentenceQueue);
49968908844SAndreas Gohr                $chunk .= $sentence;
50068908844SAndreas Gohr                $chunklen = count($tiktok->encode($chunk));
5018817535bSAndreas Gohr            }
5028817535bSAndreas Gohr        }
5038817535bSAndreas Gohr        $chunks[] = $chunk;
5048817535bSAndreas Gohr
5058817535bSAndreas Gohr        return $chunks;
5068817535bSAndreas Gohr    }
50768908844SAndreas Gohr
50868908844SAndreas Gohr    /**
50968908844SAndreas Gohr     * Add a sentence to the queue of remembered sentences
51068908844SAndreas Gohr     *
51168908844SAndreas Gohr     * @param string $sentence
51268908844SAndreas Gohr     * @return void
51368908844SAndreas Gohr     */
51468908844SAndreas Gohr    protected function rememberSentence($sentence)
51568908844SAndreas Gohr    {
51668908844SAndreas Gohr        // add sentence to queue
51768908844SAndreas Gohr        $this->sentenceQueue[] = $sentence;
51868908844SAndreas Gohr
51968908844SAndreas Gohr        // remove oldest sentences from queue until we are below the max overlap
52068908844SAndreas Gohr        $encoder = $this->getTokenEncoder();
5217ebc7895Ssplitbrain        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
52268908844SAndreas Gohr            array_shift($this->sentenceQueue);
52368908844SAndreas Gohr        }
52468908844SAndreas Gohr    }
5258817535bSAndreas Gohr}
526