xref: /plugin/aichat/Embeddings.php (revision 2071dced6f96936ea7b9bf5dbe8a117eef598448)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\ChatInterface;
7use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
8use dokuwiki\plugin\aichat\Storage\AbstractStorage;
9use dokuwiki\Search\Indexer;
10use splitbrain\phpcli\CLI;
11use TikToken\Encoder;
12use Vanderlee\Sentence\Sentence;
13
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to getChunkSize() tokens each. For each chunk the
 * embedding vector is fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model, used to size chunks and the context window */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;
    /** @var Encoder lazily created token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the backend holding the chunk vectors */
    protected $storage;

    /** @var string[] recently added sentences, reused as overlap when a new chunk starts */
    private $sentenceQueue = [];

    /** @var float the time spent (in seconds) for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;
    /** @var int configured maximum number of context chunks to fetch */
    protected $configContextChunks;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs 'chunkSize' and 'contextChunks' keys
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's context (so 4 chunks fit),
     * 90% of the embedding model's input limit, and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        return min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify are removed from the
     * storage, unchanged pages have their chunks reused, changed pages are re-chunked.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // NOTE(review): $skipRE matches against the bare page ID while $matchRE matches
            // against ":$page" (absolute ID with leading colon) — confirm this asymmetry is intended
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding fails are
     * skipped (logged), the rest keep consecutive IDs starting at $firstChunkID.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page; // the text renderer needs the current page ID set
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the chat model's input size and the
     * configured contextChunks setting.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // cast: the division yields a float, but we need a whole number of chunks
        $fetch = (int)min(
            ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * Sentences are never split. Consecutive chunks share up to MAX_OVERLAP_LEN tokens
     * of trailing sentences, so context is not lost at chunk boundaries.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxChunkLen = $this->getChunkSize(); // loop invariant, avoid recomputing per sentence

        // fix: do not carry overlap sentences over from a previous call/page
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // fix: foreach instead of while(array_shift()) so a falsy sentence like "0"
        // does not terminate the loop early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // sentence fits, add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // current chunk is full, store it ...
                $chunks[] = $chunk;

                // ... and start a new chunk with the remembered overlap sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
            // fix: remember every used sentence (including the one starting a new chunk,
            // which previously could never become part of the next overlap)
            $this->rememberSentence($sentence);
        }
        // append the final partial chunk, unless it is empty
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its total token count stays at or
     * below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
328