xref: /plugin/aichat/Embeddings.php (revision 7ee8b02d54a468f0b7f9bba54177ef52db1e95de)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
5*7ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\AbstractStorage;
6*7ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\Chunk;
7*7ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\KDTreeStorage;
8*7ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\SQLiteStorage;
98817535bSAndreas Gohruse dokuwiki\Search\Indexer;
10ad38c5fdSAndreas Gohruse Hexogen\KDTree\Exception\ValidationException;
112ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
128817535bSAndreas Gohruse TikToken\Encoder;
138817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
148817535bSAndreas Gohr
159da5f0dfSAndreas Gohr/**
169da5f0dfSAndreas Gohr * Manage the embeddings index
179da5f0dfSAndreas Gohr *
189da5f0dfSAndreas Gohr * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
19*7ee8b02dSAndreas Gohr * OpenAI and stored in the Storage backend.
209da5f0dfSAndreas Gohr */
class Embeddings
{
    /** Maximum number of tokens per page chunk */
    const MAX_TOKEN_LEN = 1000;

    /** Dimensionality of the embedding vectors (OpenAI text-embedding-ada-002) */
    const EMBEDDING_DIMENSIONS = 1536;

    /** @var OpenAI Client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null Optional CLI logger for progress and error reporting */
    protected $logger;

    /** @var AbstractStorage Backend the embedding vectors are stored in */
    protected $storage;

    /**
     * @param OpenAI $openAI The client to use for fetching embeddings
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        // FIXME the storage backend is hard coded and should be made configurable
        // (KDTreeStorage is an available alternative)
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access the vector storage backend
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * (Re)build the embeddings index from all wiki pages
     *
     * Pages whose chunks are still newer than the page file are reused as-is; changed
     * pages are re-chunked and re-embedded. Pages that no longer belong in the index
     * (deleted, hidden, or matching the skip expression) have their stale chunks removed.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation(self::EMBEDDING_DIMENSIONS);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // pages that should not be in the index: drop any chunks we may still have for them
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                ($skipRE && preg_match($skipRE, $page))
            ) {
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Chunks that fail to embed are logged and skipped rather than aborting the
     * whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];
        $parts = $this->splitIntoChunks(rawWiki($page));
        foreach ($parts as $part) {
            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($parts)]);
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // when ACLs apply, fetch more candidates than needed since the
        // permission check below may filter some of them out again
        $fetchLimit = $auth ? $limit * 2 : $limit;
        $chunks = $this->storage->getSimilarChunks($vector, $fetchLimit);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Sentences are never split across chunk boundaries; sentences longer than
     * MAX_TOKEN_LEN are dropped with a warning (splitting not implemented yet).
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // a plain foreach is used here on purpose: a while(array_shift()) loop
        // would terminate early on any falsy sentence such as the string "0"
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // sentence still fits into the current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // current chunk is full, start a new one
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // don't emit an empty trailing chunk (e.g. for empty input text)
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }
}
202