1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\File\PageResolver;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks (see getChunkSize() for how the chunk size is determined). For each
 * chunk the embedding vector is fetched from the configured embedding model and stored in the
 * Storage backend.
 */
class Embeddings
{
    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;
    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking (not referenced anywhere in this class) */
    private $sentenceQueue = [];

    /** @var float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured chunk size in tokens */
    protected $configChunkSize;
    /** @var int the configured maximum number of chunks to use as context */
    protected $configContextChunks;
    /** @var float minimum similarity score (0..1) a chunk needs to be used */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration. Uses the keys 'chunkSize', 'contextChunks' and
     *                      'similarityThreshold' (given in percent)
     */
    public function __construct(
        ChatInterface      $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage    $storage,
                           $config
    )
    {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is configured in percent, internally a 0..1 factor is used
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not greater than 0
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A value between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created lazily on first use.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * When the chat model reports no input token limit, the configured chunk size is used as is.
     * Otherwise the smallest of the following is used:
     *
     *  - a quarter of the chat model's input limit (to be able to fit 4 chunks into the max input)
     *  - 90% of the embedding model's input limit (only if that model reports a limit)
     *  - the configured chunk size (this is usually the smallest)
     *
     * @return int Always at least 1 when a chat token limit applies
     */
    public function getChunkSize()
    {
        $chatLimit = (int)$this->chatModel->getMaxInputTokenLength();
        if (!$chatLimit) {
            // no token limit, use the configured chunk size
            return (int)$this->configChunkSize;
        }

        $candidates = [
            (int)floor($chatLimit / 4), // be able to fit 4 chunks into the max input
            (int)$this->configChunkSize, // this is usually the smallest
        ];

        // an embedding model without a limit (0) must not force the chunk size down to zero
        $embedLimit = (int)$this->embedModel->getMaxInputTokenLength();
        if ($embedLimit > 0) {
            $candidates[] = (int)floor($embedLimit * 0.9); // only use 90% of the embedding model to be safe
        }

        // never return 0, the chunk size is used as a divisor in getSimilarChunks()
        return max(1, min($candidates));
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search index. Pages that do not exist, are hidden, are
     * smaller than 150 bytes or are excluded by the given expressions get their chunks removed.
     * For all other pages the existing chunks are reused when they are newer than the page,
     * otherwise new chunks are created.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page")) // match is done against the absolute page ID
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Get the content of a page
     *
     * Uses our own renderer to format the contents in an LLM friendly way. Falls back to
     * raw syntax if the renderer fails for some reason.
     *
     * The global $ID is set to the given page while rendering and restored afterwards.
     *
     * @param string $page Name of the page to read
     * @return string The content of the page
     */
    public function getPageContent($page)
    {
        global $ID;
        $keepID = $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        } finally {
            // do not leak the changed page context to the caller
            $ID = $keepID;
        }
        return $text;
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page content is obtained via getPageContent(). Each resulting chunk is prefixed with
     * the breadcrumb trail of the page. Chunks for which no embedding could be fetched are
     * logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $text = $this->getPageContent($page);
        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // default action: plugin supplied body is put in front of the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // default was prevented: only use what the plugins supplied
            $text = $eventData['body'];
        }

        $splitter = new TextSplitter($this->getChunkSize(), $this->getTokenEncoder());
        $parts = $splitter->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++; // IDs are only consumed by chunks that were actually created
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read and that reach the similarity
     * threshold, may return an empty result. The number of returned chunks is limited by the
     * configured number of context chunks and, if requested, by the chat model's input token limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        if ($tokenlimit) {
            // never ask for more chunks than could fit into the chat model's input
            $fetch = (int)min(
                floor($tokenlimit / $this->getChunkSize()),
                $this->configContextChunks
            );
        } else {
            $fetch = (int)$this->configContextChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = 0; // tokens are only counted when a limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * This works similar to getSimilarChunks, but returns the full page content for each found similar chunk
     *
     * This will not apply any token limits. Each page is returned only once, represented by the
     * first chunk that was found for it. The result is keyed by page name.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarPages($query, $lang = '')
    {
        $chunks = $this->getSimilarChunks($query, $lang, false);
        $pages = [];

        foreach ($chunks as $chunk) {
            $page = $chunk->getPage();
            if (isset($pages[$page])) continue; // we already have this page

            $content = $this->getPageContent($page);
            $crumbs = $this->breadcrumbTrail($page);

            // keep the chunk's meta data but replace its text with the full page
            $pages[$page] = new Chunk(
                $page,
                $chunk->getId(),
                $crumbs . "\n\n" . $content,
                $chunk->getEmbedding(),
                $chunk->getLanguage(),
                $chunk->getCreated(),
                $chunk->getScore()
            );
        }
        return $pages;
    }

    /**
     * Returns all chunks for a page
     *
     * Does not apply configContextChunks but checks token limits if requested. Returns an empty
     * list when the current user may not read the page or the page is not in the search index.
     *
     * @param string $page
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     */
    public function getPageChunks($page, $limits = true)
    {
        global $auth;
        if ($auth && auth_quickaclcheck($page) < AUTH_READ) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'User not allowed to read context page {page}', ['page' => $page]
            );
            return [];
        }

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: numeric looking page IDs must not match each other loosely
        $pos = array_search(cleanID($page), $pages, true);

        if ($pos === false) {
            if ($this->logger instanceof CLI) $this->logger->warning(
                'Context page {page} is not in index', ['page' => $page]
            );
            return [];
        }

        // chunk IDs start at page ID * 100, see createNewIndex()
        $chunks = $this->storage->getPageChunks($page, $pos * 100);

        $tokenlimit = $limits ? $this->chatModel->getMaxInputTokenLength() : 0;

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            $chunkSize = 0; // tokens are only counted when a limit applies
            if ($tokenlimit) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $tokenlimit) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;
        }

        return $result;
    }


    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context. Pages in the root namespace only get the crumb of
     * the page itself.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() returns false for pages in the root namespace
        $ns = (string)getNS($id);
        $namespaces = ($ns === '') ? [] : explode(':', $ns);
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            // a trailing colon makes the resolver look up the namespace's start page
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }
}
458