xref: /plugin/aichat/Embeddings.php (revision 6a18e0f40fd2d3238b0284483f1ee9aa53dad036)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
57ebc7895Ssplitbrainuse dokuwiki\Extension\PluginInterface;
6*6a18e0f4SAndreas Gohruse dokuwiki\plugin\aichat\Model\AbstractChatModel;
7*6a18e0f4SAndreas Gohruse dokuwiki\plugin\aichat\Model\AbstractEmbeddingModel;
8f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Storage\AbstractStorage;
98817535bSAndreas Gohruse dokuwiki\Search\Indexer;
102ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
118817535bSAndreas Gohruse TikToken\Encoder;
128817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
138817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most getChunkSize() tokens. For each chunk the embedding
 * vector is fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractChatModel */
    protected $chatModel;

    /** @var AbstractEmbeddingModel */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;
    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /**
     * @param AbstractChatModel $chatModel The model that will later answer questions using the chunks
     * @param AbstractEmbeddingModel $embedModel The model used to calculate embedding vectors
     * @param AbstractStorage $storage The backend the chunks are persisted in
     */
    public function __construct(
        AbstractChatModel $chatModel,
        AbstractEmbeddingModel $embedModel,
        AbstractStorage $storage
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * Without a logger, all progress and error reporting of this class is silently skipped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and then reused.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smaller of the maximum embedding token lengths reported by the chat model
     * and the embedding model, so a chunk is acceptable to both.
     *
     * @return int
     */
    public function getChunkSize()
    {
        return min(
            $this->chatModel->getMaxEmbeddingTokenLength(),
            $this->embedModel->getMaxEmbeddingTokenLength()
        );
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search indexer. Pages that do not qualify get their chunks
     * removed, pages older than their stored chunks keep them, all others are re-chunked.
     *
     * Chunk IDs are derived from the indexer's page ID (page ID * 100).
     * NOTE(review): this leaves room for 100 chunk IDs per page before the range of the next
     * page ID is reached - confirm how the storage backends handle larger pages.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                // note: the skip expression sees the bare page ID, the match expression a ":"-prefixed one
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and left out; the remaining
     * chunks are numbered consecutively starting at $firstChunkID.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // rendering is done with $ID pointing at the page; put the previous value back
            // afterwards so the caller's request context is not left pointing at this page
            $previousID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $previousID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // a failing chunk must not abort the whole page, report it and move on
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the chat model's maximum context token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // How many maximum-size chunks fit into the chat model's context? The chunk size is
        // the divisor here: dividing the chunk size by the embedding limit can never exceed 1
        // and would request no more than two chunks.
        $maxContext = $this->chatModel->getMaxContextTokenLength();
        $maxChunkTokens = max(1, (int) $this->getChunkSize()); // never divide by zero
        $fetch = (int) ceil(
            ($maxContext / $maxChunkTokens)
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $maxContext) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of less than getChunkSize() tokens
     *
     * The text is split into sentences which are collected until the chunk size would be
     * reached. Each following chunk starts with the last sentences of the text before it
     * (up to MAX_OVERLAP_LEN tokens) to keep some context across chunk borders.
     *
     * Sentences that are longer than the chunk size on their own are dropped with a warning.
     * The last chunk is always returned, even when it is empty - callers need to skip those.
     *
     * @param string $text
     * @return array
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize();

        // never carry overlap from a previously split text into this one
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a sentence like "0" is falsy and must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            if ((string) $sentence === '') continue;

            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue) . $sentence;
                $chunklen = count($tiktok->encode($chunk));
                if ($chunklen > $maxLen) {
                    // the overlap would push this chunk over the limit, rather lose the overlap
                    $chunk = $sentence;
                    $chunklen = $slen;
                }
            }

            // remember sentence for overlap check (after the overlap above has been used, so
            // the sentence is not duplicated in the chunk it starts)
            $this->rememberSentence($sentence);
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content is at most
     * MAX_OVERLAP_LEN tokens long. A single sentence longer than that empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
308