xref: /plugin/aichat/Embeddings.php (revision d5c102b3f940c6a699499e715eeb66b02677d7df)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
57ebc7895Ssplitbrainuse dokuwiki\Extension\PluginInterface;
6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Model\AbstractModel;
7f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Storage\AbstractStorage;
88817535bSAndreas Gohruse dokuwiki\Search\Indexer;
92ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
108817535bSAndreas Gohruse TikToken\Encoder;
118817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
128817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks whose size is bounded by the model's maximum embedding token length.
 * For each chunk the embedding vector is fetched from the configured model and stored in the
 * Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel the model used to create embedding vectors */
    protected $model;
    /** @var CLI|null optional logger, only used when set via setLogger() */
    protected $logger;
    /** @var Encoder lazily created by getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the backend the chunks are stored in */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model The model used to create embedding vectors
     * @param AbstractStorage $storage The backend the chunks are stored in
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * Without a logger all progress and error messages are silently dropped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Walks over all pages known to the search index. Pages that do not exist, are hidden, are
     * smaller than 150 bytes or are excluded by the given expressions get their chunks removed.
     * For all other pages the existing chunks are reused when they are newer than the page file,
     * otherwise the page is split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters),
     *                       matched against the page ID
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters),
     *                        matched against the page ID prefixed with a colon
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): createPageChunks() increments the ID once per chunk, so a page producing
            // more than 100 chunks would run into the ID range of the next page - confirm this is
            // handled by the storage backends or cannot happen in practice
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) {
                    $this->logger->info("Reusing chunks for $page");
                }
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped, they do not
     * abort the processing of the remaining chunks.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // rendering needs the global page context, but we must not leave it modified
            $previousID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $previousID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks((string) $text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the model's maximum context token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);
        $maxContext = $this->model->getMaxContextTokenLength();

        // ceil() returns a float, the storage is asked for a whole number of chunks
        $fetch = (int) ceil(
            ($maxContext / $this->model->getMaxEmbeddingTokenLength())
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $maxContext) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into chunks that fit the model's embedding token limit
     *
     * The text is split into sentences which are then combined into chunks. Each new chunk
     * starts with the most recently remembered sentences (up to MAX_OVERLAP_LEN tokens) to
     * provide overlap with the preceding chunk. Chunks consisting of whitespace only are
     * not returned.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->model->getMaxEmbeddingTokenLength();

        // the overlap queue must not carry sentences over from a previously split text
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // iterate instead of shifting: a falsy sentence like "0" must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                if (trim($chunk) !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences, dropping the oldest ones
                // until overlap and sentence together fit into the token limit
                $overlap = $this->sentenceQueue;
                do {
                    $chunk = implode(' ', $overlap) . $sentence;
                    $chunklen = count($tiktok->encode($chunk));
                } while ($chunklen > $maxLen && array_shift($overlap) !== null);
            }

            // remember every used sentence for the overlap of the following chunk
            $this->rememberSentence($sentence);
        }
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so that its space-joined content does not
     * exceed MAX_OVERLAP_LEN tokens. A single sentence longer than that empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}