xref: /plugin/aichat/Embeddings.php (revision 720bb43f9ac252f6e0b09e7b06804dec7c547a47)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
57ebc7895Ssplitbrainuse dokuwiki\Extension\PluginInterface;
6294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\ChatInterface;
7294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\EmbeddingInterface;
8f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Storage\AbstractStorage;
98817535bSAndreas Gohruse dokuwiki\Search\Indexer;
102ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
118817535bSAndreas Gohruse TikToken\Encoder;
128817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
138817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most getChunkSize() tokens each. For each chunk the embedding
 * vector is fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model, used here only for its maximum input token length */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to calculate embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger, only used when set via setLogger() */
    protected $logger;

    /** @var Encoder lazily created by getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking (the overlap window for the next chunk) */
    private $sentenceQueue = [];

    /** @var float|int the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int|string the 'chunkSize' configuration value (upper bound for the chunk size in tokens) */
    protected $configChunkSize;

    /** @var int|string the 'contextChunks' configuration value (upper bound for chunks fetched per query) */
    protected $configContextChunks;

    /** @var float the 'similarityThreshold' configuration value divided by 100 */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, the keys 'chunkSize', 'contextChunks' and
     *                      'similarityThreshold' are read
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is compared against Chunk::getScore(), so it is scaled down by 100 here
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * When a logger is set, progress and problems are reported through it.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's maximum input, 90% of the embedding
     * model's maximum input and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns a float, cast so that callers really get the documented integer
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search indexer. Pages that should not be indexed have
     * their chunks removed, unchanged pages keep their existing chunks, all other pages are
     * re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): a page yielding more than 100 chunks would run into the ID range of the
            // next page ID - confirm whether the storage backends guard against that
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page")) // matched against the ID with a leading colon
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped, so the returned list
     * may be shorter than the number of text parts or even empty.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            // the global $ID is pointed at the page before rendering and is not restored afterwards
            // NOTE(review): presumably the renderer needs it to resolve relative references - confirm
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // a failing chunk must not abort the whole page, report it and carry on
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            // IDs are only consumed by chunks that were actually created
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the contextChunks setting, the similarity
     * threshold and the maximum input token length of the chat model.
     *
     * The time spent in the storage lookup is made available in $timeSpent.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // the division yields a float, the storage is asked for a whole number of chunks
        $fetch = (int)floor(
            min(
                ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
                $this->configContextChunks
            )
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * The text is split into sentences which are collected until the chunk size would be reached.
     * Each following chunk starts with the last sentences of the previous one (at most
     * MAX_OVERLAP_LEN tokens) to provide some overlapping context.
     *
     * Sentences that are longer than the chunk size on their own are currently dropped. The
     * returned list may contain empty strings (e.g. for an empty text).
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxChunkLen = $this->getChunkSize(); // does not change while splitting

        // start with a clean overlap window, sentences of a previously split text must never
        // leak into the chunks of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: a falsy sentence like "0" must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));

                // the sentence is part of the new chunk, so it belongs into the overlap window, too
                $this->rememberSentence($sentence);
            }
        }
        // whatever is left forms the last chunk
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined text is no longer than
     * MAX_OVERLAP_LEN tokens. A single sentence exceeding that limit leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
331