xref: /plugin/aichat/Embeddings.php (revision 689088446f64ff8d9dfdae9ae0666b45de449da7)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
57ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\AbstractStorage;
67ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\Chunk;
77ee8b02dSAndreas Gohruse dokuwiki\plugin\aichat\backend\SQLiteStorage;
88817535bSAndreas Gohruse dokuwiki\Search\Indexer;
92ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
108817535bSAndreas Gohruse TikToken\Encoder;
118817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
128817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most MAX_CHUNK_LEN tokens each. For each chunk the embedding vector is
 * fetched from OpenAI and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum length of all context chunks together, in tokens */
    const MAX_CONTEXT_LEN = 3800;

    /** @var int size of the chunks in tokens */
    const MAX_CHUNK_LEN = 1000;

    /** @var int maximum overlap between chunks in tokens */
    const MAX_OVERLAP_LEN = 200;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger, progress and errors are only reported when set */
    protected $logger;
    /** @var Encoder|null lazily created by getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var string[] most recent sentences of the text being chunked, used as overlap for the next chunk */
    private $sentenceQueue = [];

    /**
     * @param OpenAI $openAI
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if ($this->tokenEncoder === null) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search indexer. Pages that do not exist, are hidden, are very small or
     * match $skipRE are removed from the storage. For all other pages the existing chunks are reused when
     * they are newer than the page file, otherwise they are recreated.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // NOTE(review): 1536 is presumably the dimension of the embedding vectors - confirm against the model
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged (when a logger is set) and skipped, they
     * do not consume a chunk ID.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            // the global $ID is set to the page before it is rendered
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // fetch a few more than needed, since not all chunks are maximum length
        // ceil() returns a float, the storage is asked for a whole number of chunks
        $fetch = (int) ceil((self::MAX_CONTEXT_LEN / self::MAX_CHUNK_LEN) * 1.2);
        $chunks = $this->storage->getSimilarChunks($vector, $fetch);

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > self::MAX_CONTEXT_LEN) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of roughly MAX_CHUNK_LEN tokens
     *
     * The text is split into sentences which are collected into chunks. Each chunk after the first one
     * starts with the most recent sentences of the text before it (up to MAX_OVERLAP_LEN tokens), so a
     * chunk may exceed MAX_CHUNK_LEN by that overlap. Sentences longer than MAX_CHUNK_LEN are dropped.
     *
     * The returned list always contains at least one entry and may contain empty strings.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        // start with an empty overlap queue, sentences remembered from a previously split text
        // must never end up in the chunks of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a falsy sentence such as "0" must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_CHUNK_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_CHUNK_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember every accepted sentence, so the overlap always consists of the sentences
            // directly preceding the next chunk
            $this->rememberSentence($sentence);
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined text is at most MAX_OVERLAP_LEN tokens long.
     * A single sentence longer than that empties the queue completely.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
278