xref: /plugin/aichat/Embeddings.php (revision 7be8078ef9026e317a5c01f90a94183276bbbbd2)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\File\PageResolver;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
15/**
16 * Manage the embeddings index
17 *
18 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
19 * OpenAI and stored in the Storage backend.
20 */
21class Embeddings
22{
23    /** @var int maximum overlap between chunks in tokens */
24    final public const MAX_OVERLAP_LEN = 200;
25
26    /** @var ChatInterface */
27    protected $chatModel;
28
29    /** @var EmbeddingInterface */
30    protected $embedModel;
31
32    /** @var CLI|null */
33    protected $logger;
34    /** @var Encoder */
35    protected $tokenEncoder;
36
37    /** @var AbstractStorage */
38    protected $storage;
39
40    /** @var array remember sentences when chunking */
41    private $sentenceQueue = [];
42
43    /** @var int the time spent for the last similar chunk retrieval */
44    public $timeSpent = 0;
45
46    protected $configChunkSize;
47    protected $configContextChunks;
48    protected $similarityThreshold;
49
50    /**
51     * Embeddings constructor.
52     *
53     * @param ChatInterface $chatModel
54     * @param EmbeddingInterface $embedModel
55     * @param AbstractStorage $storage
56     * @param array $config The plugin configuration
57     */
58    public function __construct(
59        ChatInterface      $chatModel,
60        EmbeddingInterface $embedModel,
61        AbstractStorage    $storage,
62                           $config
63    )
64    {
65        $this->chatModel = $chatModel;
66        $this->embedModel = $embedModel;
67        $this->storage = $storage;
68        $this->configChunkSize = $config['chunkSize'];
69        $this->configContextChunks = $config['contextChunks'];
70        $this->similarityThreshold = $config['similarityThreshold'] / 100;
71    }
72
73    /**
74     * Access storage
75     *
76     * @return AbstractStorage
77     */
78    public function getStorage()
79    {
80        return $this->storage;
81    }
82
83    /**
84     * Override the number of used context chunks
85     *
86     * @param int $max
87     * @return void
88     */
89    public function setConfigContextChunks(int $max)
90    {
91        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
92        $this->configContextChunks = $max;
93    }
94
95    /**
96     * Override the similiarity threshold
97     *
98     * @param float $threshold
99     * @return void
100     */
101    public function setSimilarityThreshold(float $threshold)
102    {
103        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
104        $this->similarityThreshold = $threshold;
105    }
106
107    /**
108     * Add a logger instance
109     *
110     * @return void
111     */
112    public function setLogger(CLI $logger)
113    {
114        $this->logger = $logger;
115    }
116
117    /**
118     * Get the token encoder instance
119     *
120     * @return Encoder
121     */
122    public function getTokenEncoder()
123    {
124        if (!$this->tokenEncoder instanceof Encoder) {
125            $this->tokenEncoder = new Encoder();
126        }
127        return $this->tokenEncoder;
128    }
129
130    /**
131     * Return the chunk size to use
132     *
133     * @return int
134     */
135    public function getChunkSize()
136    {
137        $tokenlimit = $this->chatModel->getMaxInputTokenLength();
138        if(!$tokenlimit) {
139            // no token limit, use the configured chunk size
140            return $this->configChunkSize;
141        }
142
143        return min(
144            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
145            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
146            $this->configChunkSize, // this is usually the smallest
147        );
148    }
149
    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the fulltext indexer, removes chunks for pages
     * that should no longer be indexed and (re)creates chunks for new or changed pages.
     * Unchanged pages have their existing chunks reused.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                // NOTE(review): $matchRE is tested against ":$page" while $skipRE uses the
                // bare ID — presumably to let the RE anchor on a leading colon; confirm intended
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            // compare the page's modification time against the creation time of its first chunk
            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }
194
    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        // set the global page ID — presumably the renderer relies on it; TODO confirm
        global $ID;
        $ID = $page;
        try {
            // render the page through the 'aichat' renderer (text renderer plugin)
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            // rendering failed, fall back to the raw wiki syntax
            if ($this->logger) $this->logger->error(
                'Failed to render page {page}. Using raw text instead. {msg}',
                ['page' => $page, 'msg' => $e->getMessage()]
            );
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // event not prevented: prepend any plugin-supplied body to the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // event prevented the default: index only what the plugins provided
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // skip this chunk but keep processing the rest of the page
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            // chunk IDs of one page are consecutive, starting at $firstChunkID
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }
270
271    /**
272     * Do a nearest neighbor search for chunks similar to the given question
273     *
274     * Returns only chunks the current user is allowed to read, may return an empty result.
275     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
276     *
277     * @param string $query The question
278     * @param string $lang Limit results to this language
279     * @param bool $limits Apply chat token limits to the number of chunks returned?
280     * @return Chunk[]
281     * @throws \Exception
282     */
283    public function getSimilarChunks($query, $lang = '', $limits = true)
284    {
285        global $auth;
286        $vector = $this->embedModel->getEmbedding($query);
287
288        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;
289
290        if ($tokenlimit) {
291            $fetch = min(
292                ($tokenlimit / $this->getChunkSize()),
293                $this->configContextChunks
294            );
295        } else {
296            $fetch = $this->configContextChunks;
297        }
298
299        $time = microtime(true);
300        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
301        $this->timeSpent = round(microtime(true) - $time, 2);
302        if ($this->logger instanceof CLI) {
303            $this->logger->info(
304                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
305                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
306            );
307        }
308
309        $size = 0;
310        $result = [];
311        foreach ($chunks as $chunk) {
312            // filter out chunks the user is not allowed to read
313            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
314            if ($chunk->getScore() < $this->similarityThreshold) continue;
315
316            if ($tokenlimit) {
317                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
318                if ($size + $chunkSize > $tokenlimit) break; // we have enough
319            }
320
321            $result[] = $chunk;
322            $size += $chunkSize ?? 0;
323
324            if (count($result) >= $this->configContextChunks) break; // we have enough
325        }
326        return $result;
327    }
328
329    /**
330     * Returns all chunks for a page
331     *
332     * Does not apply configContextChunks but checks token limits if requested
333     *
334     * @param string $page
335     * @param bool $limits Apply chat token limits to the number of chunks returned?
336     * @return Chunk[]
337     */
338    public function getPageChunks($page, $limits = true)
339    {
340        global $auth;
341        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
342            if ($this->logger instanceof CLI) $this->logger->warning(
343                'User not allowed to read context page {page}', ['page' => $page]
344            );
345            return [];
346        }
347
348        $indexer = new Indexer();
349        $pages = $indexer->getPages();
350        $pos = array_search(cleanID($page), $pages);
351
352        if ($pos === false) {
353            if ($this->logger instanceof CLI) $this->logger->warning(
354                'Context page {page} is not in index', ['page' => $page]
355            );
356            return [];
357        }
358
359        $chunks = $this->storage->getPageChunks($page, $pos * 100);
360
361        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;
362
363        $size = 0;
364        $result = [];
365        foreach ($chunks as $chunk) {
366            if ($tokenlimit) {
367                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
368                if ($size + $chunkSize > $tokenlimit) break; // we have enough
369            }
370
371            $result[] = $chunk;
372            $size += $chunkSize ?? 0;
373        }
374
375        return $result;
376    }
377
378
379    /**
380     * Create a breadcrumb trail for the given page
381     *
382     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
383     * each chunk to give the AI some context.
384     *
385     * @param string $id
386     * @return string
387     */
388    protected function breadcrumbTrail($id)
389    {
390        $namespaces = explode(':', getNS($id));
391        $resolver = new PageResolver($id);
392        $crumbs = [];
393
394        // all namespaces
395        $check = '';
396        foreach ($namespaces as $namespace) {
397            $check .= $namespace . ':';
398            $page = $resolver->resolveId($check);
399            $title = p_get_first_heading($page);
400            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
401        }
402
403        // the page itself
404        $title = p_get_first_heading($id);
405        $page = noNS($id);
406        $crumbs[] = $title ? "$title ($page)" : $page;
407
408        return implode(' » ', $crumbs);
409    }
410
411    /**
412     * @param $text
413     * @return array
414     * @throws \Exception
415     * @todo support splitting too long sentences
416     */
417    protected function splitIntoChunks($text)
418    {
419        $sentenceSplitter = new Sentence();
420        $tiktok = $this->getTokenEncoder();
421
422        $chunks = [];
423        $sentences = $sentenceSplitter->split($text);
424
425        $chunklen = 0;
426        $chunk = '';
427        while ($sentence = array_shift($sentences)) {
428            $slen = count($tiktok->encode($sentence));
429            if ($slen > $this->getChunkSize()) {
430                // sentence is too long, we need to split it further
431                if ($this->logger instanceof CLI) $this->logger->warning(
432                    'Sentence too long, splitting not implemented yet'
433                );
434                continue;
435            }
436
437            if ($chunklen + $slen < $this->getChunkSize()) {
438                // add to current chunk
439                $chunk .= $sentence;
440                $chunklen += $slen;
441                // remember sentence for overlap check
442                $this->rememberSentence($sentence);
443            } else {
444                // add current chunk to result
445                $chunk = trim($chunk);
446                if ($chunk !== '') $chunks[] = $chunk;
447
448                // start new chunk with remembered sentences
449                $chunk = implode(' ', $this->sentenceQueue);
450                $chunk .= $sentence;
451                $chunklen = count($tiktok->encode($chunk));
452            }
453        }
454        $chunks[] = $chunk;
455
456        return $chunks;
457    }
458
459    /**
460     * Add a sentence to the queue of remembered sentences
461     *
462     * @param string $sentence
463     * @return void
464     */
465    protected function rememberSentence($sentence)
466    {
467        // add sentence to queue
468        $this->sentenceQueue[] = $sentence;
469
470        // remove oldest sentences from queue until we are below the max overlap
471        $encoder = $this->getTokenEncoder();
472        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
473            array_shift($this->sentenceQueue);
474        }
475    }
476}
477