xref: /plugin/aichat/Embeddings.php (revision 614f8ab4ac738aed3502238736999a25e6cf719a)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
57ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\AbstractStorage;
67ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\Chunk;
77ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\KDTreeStorage;
87ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\SQLiteStorage;
98817535bSAndreas Gohruse dokuwiki\Search\Indexer;
10ad38c5fdSAndreas Gohruse Hexogen\KDTree\Exception\ValidationException;
112ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
128817535bSAndreas Gohruse TikToken\Encoder;
138817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
148817535bSAndreas Gohr
159da5f0dfSAndreas Gohr/**
169da5f0dfSAndreas Gohr * Manage the embeddings index
179da5f0dfSAndreas Gohr *
189da5f0dfSAndreas Gohr * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
197ee8b02dSAndreas Gohr * OpenAI and stored in the Storage backend.
209da5f0dfSAndreas Gohr */
class Embeddings
{
    /** @var int Maximum number of tokens allowed in a single chunk */
    const MAX_TOKEN_LEN = 1000;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;
    /** @var CLI|null optional CLI logger for progress and error reporting */
    protected $logger;

    /** @var AbstractStorage backend that persists the chunk vectors */
    protected $storage;

    /**
     * @param OpenAI $openAI initialized client used for all embedding requests
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        // NOTE(review): storage backend is hard coded to SQLite here; KDTreeStorage
        // exists as an alternative but is not selectable yet
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * When set, progress, warnings and errors are reported through it.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the fulltext indexer. Pages that no longer
     * qualify (deleted, hidden, very small, or matching $skipRE) are removed from
     * the storage. Unchanged pages have their existing chunks reused; new or
     * modified pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the dimensionality of the embedding vectors stored per chunk
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which the embedding request fails are logged and skipped;
     * the remaining chunks are still returned.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            // rendering relies on the global $ID - set it temporarily and make
            // sure it is restored, even when rendering throws
            $keepID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $keepID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // over-fetch candidates: ACL filtering below may drop some of them,
        // and we still want up to $limit readable results
        $chunks = $this->storage->getSimilarChunks($vector, $limit * 2);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Sentences are kept intact; a chunk is closed as soon as the next sentence
     * would push it over the token limit. Sentences that are individually longer
     * than the limit are dropped (with a warning).
     *
     * @param string $text
     * @return string[] the chunk texts, possibly empty
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a bare truthiness check would abort on a falsy
        // sentence like "0" and silently drop the rest of the text
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // don't emit a trailing empty chunk (e.g. for empty input)
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }
}
227