xref: /plugin/aichat/Embeddings.php (revision 9634d7345e88e8177bbba1e0ecb312352866df1d)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\File\PageResolver;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks whose size is derived from the plugin configuration and the token
 * limits of the configured models (see getChunkSize()). For each chunk the embedding vector is
 * fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder lazily created, use getTokenEncoder() to access it */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks used as context */
    protected $configContextChunks;

    /** @var float minimum similarity score (0..1) a chunk needs to be used */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration. The keys 'chunkSize', 'contextChunks' and
     *                      'similarityThreshold' (given in percent) are read.
     */
    public function __construct(
        ChatInterface      $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage    $storage,
                           $config
    )
    {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the configuration holds a percentage, internally a 0..1 fraction is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not a positive number
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A value between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException when the threshold is out of range
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * Without a chat model token limit the configured chunk size is used as is. Otherwise the
     * smallest of the configured size, a quarter of the chat model input and 90% of the embedding
     * model input is used. An embedding model reporting no limit (0) does not restrict the size.
     *
     * @return int chunk size in tokens
     */
    public function getChunkSize()
    {
        $chatLimit = (int)$this->chatModel->getMaxInputTokenLength();
        if (!$chatLimit) {
            // no token limit, use the configured chunk size
            return (int)$this->configChunkSize;
        }

        $candidates = [
            (int)floor($chatLimit / 4), // be able to fit 4 chunks into the max input
            (int)$this->configChunkSize, // this is usually the smallest
        ];

        $embedLimit = (int)$this->embedModel->getMaxInputTokenLength();
        if ($embedLimit > 0) {
            // only use 90% of the embedding model to be safe
            $candidates[] = (int)floor($embedLimit * 0.9);
        }

        // never report less than one token, callers divide by this value
        return max(1, min($candidates));
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search index. Pages that are missing, hidden, very small or
     * excluded by the given expressions get their chunks removed. Chunks of pages that were not
     * modified since their first chunk was created are reused, all other pages are re-chunked.
     *
     * Note that $skipRE is matched against the plain page ID while $matchRE is matched against
     * the page ID prefixed with a colon.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): this leaves 100 IDs per page; presumably a page producing more chunks
            // would reach into the ID range of the next page - confirm against the storage backends
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason
     *
     * Be aware that the global $ID is set to the given page and is not restored afterwards.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page content is obtained through getPageContent(). Each chunk is prefixed with the
     * breadcrumb trail of the page before its embedding is requested. Chunks whose embedding
     * could not be fetched are logged and left out.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // plugins may have added text, it is put in front of the page content
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default action was prevented, only use what the plugins provided
            $text = $eventData['body'];
        }

        $chunkID = $firstChunkID;
        foreach ($this->splitIntoChunks($text) as $part) {
            if (trim((string)$part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++; // IDs are only consumed by chunks that were actually created
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read and that reach the similarity
     * threshold, may return an empty result. The number of returned chunks is limited by the
     * configured number of context chunks and, if requested, by the chat model's token limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        // the storage expects a whole number of chunks to fetch
        $fetch = (int)$this->configContextChunks;
        if ($tokenlimit) {
            $fitting = max(1, (int)floor($tokenlimit / $this->getChunkSize()));
            $fetch = min($fitting, $fetch);
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0;
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits. Each page is returned only once, using the data of
     * the first chunk found for it. The result is keyed by page name.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($page);
            $crumbs = $this->breadcrumbTrail($page);

            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested. Returns an empty
     * list when the current user may not read the page or the page is not in the search index.
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'User not allowed to read context page {page}', ['page' => $page]
                );
            }
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: numeric looking page IDs (e.g. "1e3" and "1000") must not match each other
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) {
                $this->logger->warning(
                    'Context page {page} is not in index', ['page' => $page]
                );
            }
            return [];
        }

        // same ID scheme as in createNewIndex(): index position * 100
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0;
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }


    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context. Pages without a namespace only get the crumb of
     * the page itself.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        $resolver = new PageResolver($id);
        $crumbs = [];

        // pages in the root namespace have no namespace part, do not create an empty crumb for them
        $ns = getNS($id);
        $namespaces = ($ns === false || $ns === '') ? [] : explode(':', $ns);

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * The text is split at sentence boundaries. Every chunk but the first starts with the most
     * recently seen sentences (up to MAX_OVERLAP_LEN tokens) to provide some overlap between
     * chunks. Sentences that are longer than the chunk size are skipped.
     *
     * @param string $text
     * @return string[] the trimmed, non-empty chunks
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $chunkSize = $this->getChunkSize(); // does not change while splitting

        // the overlap must never carry over from a previously processed text
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a sentence like "0" is falsy and must not end the loop early
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $chunkSize) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $chunkSize) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue) . $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember every used sentence for the overlap of the next chunk
            $this->rememberSentence($sentence);
        }

        // add the remainder, treated the same way as all other chunks
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is shortened from the front until its joined content is at most
     * MAX_OVERLAP_LEN tokens long. A single sentence above that limit empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
526