xref: /plugin/aichat/Embeddings.php (revision 689088446f64ff8d9dfdae9ae0666b45de449da7)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\plugin\aichat\backend\AbstractStorage;
6use dokuwiki\plugin\aichat\backend\Chunk;
7use dokuwiki\plugin\aichat\backend\SQLiteStorage;
8use dokuwiki\Search\Indexer;
9use splitbrain\phpcli\CLI;
10use TikToken\Encoder;
11use Vanderlee\Sentence\Sentence;
12
13/**
14 * Manage the embeddings index
15 *
16 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
17 * OpenAI and stored in the Storage backend.
18 */
class Embeddings
{
    /** @var int maximum number of tokens of all context chunks together */
    const MAX_CONTEXT_LEN = 3800;

    /** @var int maximum size of a single chunk in tokens */
    const MAX_CHUNK_LEN = 1000;

    /** @var int maximum overlap between two consecutive chunks in tokens */
    const MAX_OVERLAP_LEN = 200;

    /** @var OpenAI API client used to fetch embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;
    /** @var Encoder lazily instantiated token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend holding the chunks and their vectors */
    protected $storage;

    /** @var string[] recently seen sentences, used to build the overlap between chunks */
    private $sentenceQueue = [];

    /**
     * @param OpenAI $openAI The initialized OpenAI client
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if ($this->tokenEncoder === null) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all known wiki pages. Pages that should not be indexed (deleted, hidden,
     * very small or matching $skipRE) have their chunks removed. Pages older than their stored
     * chunks keep the existing chunks, everything else is re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the dimension of the embedding vectors stored per chunk
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which the embedding request fails are logged and skipped (best effort).
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        // prefer the text renderer plugin for clean, markup-free text
        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            // NOTE(review): overwrites the global page ID without restoring it afterwards —
            // confirm no caller relies on $ID after indexing
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $chunkID = $firstChunkID; // running ID for the chunks created below
        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log the failure and continue with the remaining chunks
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // fetch a few more than needed, since not all chunks are maximum length
        $fetch = (int) ceil((self::MAX_CONTEXT_LEN / self::MAX_CHUNK_LEN) * 1.2);
        $chunks = $this->storage->getSimilarChunks($vector, $fetch);

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > self::MAX_CONTEXT_LEN) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of at most MAX_CHUNK_LEN tokens
     *
     * Chunks are built from whole sentences. Consecutive chunks share up to
     * MAX_OVERLAP_LEN tokens of trailing sentences to preserve context.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        // start with a clean overlap queue, otherwise sentences from a previously
        // chunked text would leak into the first chunk of this one
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // strict null check: array_shift() returns null once the list is exhausted,
        // but a falsy sentence like "0" must not terminate the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_CHUNK_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_CHUNK_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with the remembered overlap sentences, then reset
                // the queue so stale sentences cannot reappear in later chunks
                $chunk = join(' ', $this->sentenceQueue);
                $this->sentenceQueue = [];
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence opening this chunk also counts towards the next overlap
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its combined token count stays
     * at or below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(join(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
278