1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\File\PageResolver;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
15/**
16 * Manage the embeddings index
17 *
 * Pages are split into chunks whose size is limited by the plugin configuration and the token limits of
 * the models in use. For each chunk the embedding vector is fetched from the configured embedding model
 * and stored in the Storage backend.
20 */
21class Embeddings
22{
23    /** @var int maximum overlap between chunks in tokens */
24    final public const MAX_OVERLAP_LEN = 200;
25
26    /** @var ChatInterface */
27    protected $chatModel;
28
29    /** @var EmbeddingInterface */
30    protected $embedModel;
31
32    /** @var CLI|null */
33    protected $logger;
34    /** @var Encoder */
35    protected $tokenEncoder;
36
37    /** @var AbstractStorage */
38    protected $storage;
39
40    /** @var array remember sentences when chunking */
41    private $sentenceQueue = [];
42
    /** @var float the time in seconds (rounded to two decimals) spent for the last similar chunk retrieval */
44    public $timeSpent = 0;
45
46    protected $configChunkSize;
47    protected $configContextChunks;
48    protected $similarityThreshold;
49
50    /**
51     * Embeddings constructor.
52     *
53     * @param ChatInterface $chatModel
54     * @param EmbeddingInterface $embedModel
55     * @param AbstractStorage $storage
56     * @param array $config The plugin configuration
57     */
58    public function __construct(
59        ChatInterface      $chatModel,
60        EmbeddingInterface $embedModel,
61        AbstractStorage    $storage,
62                           $config
63    )
64    {
65        $this->chatModel = $chatModel;
66        $this->embedModel = $embedModel;
67        $this->storage = $storage;
68        $this->configChunkSize = $config['chunkSize'];
69        $this->configContextChunks = $config['contextChunks'];
70        $this->similarityThreshold = $config['similarityThreshold'] / 100;
71    }
72
73    /**
74     * Access storage
75     *
76     * @return AbstractStorage
77     */
78    public function getStorage()
79    {
80        return $this->storage;
81    }
82
83    /**
84     * Override the number of used context chunks
85     *
86     * @param int $max
87     * @return void
88     */
89    public function setConfigContextChunks(int $max)
90    {
91        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
92        $this->configContextChunks = $max;
93    }
94
95    /**
96     * Override the similiarity threshold
97     *
98     * @param float $threshold
99     * @return void
100     */
101    public function setSimilarityThreshold(float $threshold)
102    {
103        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
104        $this->similarityThreshold = $threshold;
105    }
106
107    /**
108     * Add a logger instance
109     *
110     * @return void
111     */
112    public function setLogger(CLI $logger)
113    {
114        $this->logger = $logger;
115    }
116
117    /**
118     * Get the token encoder instance
119     *
120     * @return Encoder
121     */
122    public function getTokenEncoder()
123    {
124        if (!$this->tokenEncoder instanceof Encoder) {
125            $this->tokenEncoder = new Encoder();
126        }
127        return $this->tokenEncoder;
128    }
129
130    /**
131     * Return the chunk size to use
132     *
133     * @return int
134     */
135    public function getChunkSize()
136    {
137        return min(
138            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
139            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
140            $this->configChunkSize, // this is usually the smallest
141        );
142    }
143
144    /**
145     * Update the embeddings storage
146     *
147     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
148     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
149     * @param bool $clear Should any existing storage be cleared before updating?
150     * @return void
151     * @throws \Exception
152     */
153    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
154    {
155        $indexer = new Indexer();
156        $pages = $indexer->getPages();
157
158        $this->storage->startCreation($clear);
159        foreach ($pages as $pid => $page) {
160            $chunkID = $pid * 100; // chunk IDs start at page ID * 100
161
162            if (
163                !page_exists($page) ||
164                isHiddenPage($page) ||
165                filesize(wikiFN($page)) < 150 || // skip very small pages
166                ($skipRE && preg_match($skipRE, (string)$page)) ||
167                ($matchRE && !preg_match($matchRE, ":$page"))
168            ) {
169                // this page should not be in the index (anymore)
170                $this->storage->deletePageChunks($page, $chunkID);
171                continue;
172            }
173
174            $firstChunk = $this->storage->getChunk($chunkID);
175            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
176                // page is older than the chunks we have, reuse the existing chunks
177                $this->storage->reusePageChunks($page, $chunkID);
178                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
179            } else {
180                // page is newer than the chunks we have, create new chunks
181                $this->storage->deletePageChunks($page, $chunkID);
182                $chunks = $this->createPageChunks($page, $chunkID);
183                if ($chunks) $this->storage->addPageChunks($chunks);
184            }
185        }
186        $this->storage->finalizeCreation();
187    }
188
189    /**
190     * Split the given page, fetch embedding vectors and return Chunks
191     *
192     * Will use the text renderer plugin if available to get the rendered text.
193     * Otherwise the raw wiki text is used.
194     *
195     * @param string $page Name of the page to split
196     * @param int $firstChunkID The ID of the first chunk of this page
197     * @return Chunk[] A list of chunks created for this page
198     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
199     * @throws \Exception
200     */
201    public function createPageChunks($page, $firstChunkID)
202    {
203        $chunkList = [];
204
205        global $ID;
206        $ID = $page;
207        try {
208            $text = p_cached_output(wikiFN($page), 'aichat', $page);
209        } catch (\Throwable $e) {
210            if ($this->logger) $this->logger->error(
211                'Failed to render page {page}. Using raw text instead. {msg}',
212                ['page' => $page, 'msg' => $e->getMessage()]
213            );
214            $text = rawWiki($page);
215        }
216
217        $crumbs = $this->breadcrumbTrail($page);
218
219        // allow plugins to modify the text before splitting
220        $eventData = [
221            'page' => $page,
222            'body' => '',
223            'metadata' => ['title' => $page, 'relation_references' => []],
224        ];
225        $event = new Event('INDEXER_PAGE_ADD', $eventData);
226        if ($event->advise_before()) {
227            $text = $eventData['body'] . ' ' . $text;
228        } else {
229            $text = $eventData['body'];
230        }
231
232        $parts = $this->splitIntoChunks($text);
233        foreach ($parts as $part) {
234            if (trim((string)$part) == '') continue; // skip empty chunks
235
236            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk
237
238            try {
239                $embedding = $this->embedModel->getEmbedding($part);
240            } catch (\Exception $e) {
241                if ($this->logger instanceof CLI) {
242                    $this->logger->error(
243                        'Failed to get embedding for chunk of page {page}: {msg}',
244                        ['page' => $page, 'msg' => $e->getMessage()]
245                    );
246                }
247                continue;
248            }
249            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
250            $firstChunkID++;
251        }
252        if ($this->logger instanceof CLI) {
253            if ($chunkList !== []) {
254                $this->logger->success(
255                    '{id} split into {count} chunks',
256                    ['id' => $page, 'count' => count($chunkList)]
257                );
258            } else {
259                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
260            }
261        }
262        return $chunkList;
263    }
264
265    /**
266     * Do a nearest neighbor search for chunks similar to the given question
267     *
268     * Returns only chunks the current user is allowed to read, may return an empty result.
269     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
270     *
271     * @param string $query The question
272     * @param string $lang Limit results to this language
273     * @param bool $limits Apply chat token limits to the number of chunks returned?
274     * @return Chunk[]
275     * @throws \Exception
276     */
277    public function getSimilarChunks($query, $lang = '', $limits = true)
278    {
279        global $auth;
280        $vector = $this->embedModel->getEmbedding($query);
281
282        if ($limits) {
283            $fetch = min(
284                ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
285                $this->configContextChunks
286            );
287        } else {
288            $fetch = $this->configContextChunks;
289        }
290
291        $time = microtime(true);
292        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
293        $this->timeSpent = round(microtime(true) - $time, 2);
294        if ($this->logger instanceof CLI) {
295            $this->logger->info(
296                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
297                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
298            );
299        }
300
301        $size = 0;
302        $result = [];
303        foreach ($chunks as $chunk) {
304            // filter out chunks the user is not allowed to read
305            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
306            if ($chunk->getScore() < $this->similarityThreshold) continue;
307
308            if ($limits) {
309                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
310                if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough
311            }
312
313            $result[] = $chunk;
314            $size += $chunkSize ?? 0;
315
316            if (count($result) >= $this->configContextChunks) break; // we have enough
317        }
318        return $result;
319    }
320
321    /**
322     * Create a breadcrumb trail for the given page
323     *
324     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
325     * each chunk to give the AI some context.
326     *
327     * @param string $id
328     * @return string
329     */
330    protected function breadcrumbTrail($id)
331    {
332        $namespaces = explode(':', getNS($id));
333        $resolver = new PageResolver($id);
334        $crumbs = [];
335
336        // all namespaces
337        $check = '';
338        foreach ($namespaces as $namespace) {
339            $check .= $namespace . ':';
340            $page = $resolver->resolveId($check);
341            $title = p_get_first_heading($page);
342            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
343        }
344
345        // the page itself
346        $title = p_get_first_heading($id);
347        $page = noNS($id);
348        $crumbs[] = $title ? "$title ($page)" : $page;
349
350        return implode(' » ', $crumbs);
351    }
352
353    /**
354     * @param $text
355     * @return array
356     * @throws \Exception
357     * @todo support splitting too long sentences
358     */
359    protected function splitIntoChunks($text)
360    {
361        $sentenceSplitter = new Sentence();
362        $tiktok = $this->getTokenEncoder();
363
364        $chunks = [];
365        $sentences = $sentenceSplitter->split($text);
366
367        $chunklen = 0;
368        $chunk = '';
369        while ($sentence = array_shift($sentences)) {
370            $slen = count($tiktok->encode($sentence));
371            if ($slen > $this->getChunkSize()) {
372                // sentence is too long, we need to split it further
373                if ($this->logger instanceof CLI) $this->logger->warning(
374                    'Sentence too long, splitting not implemented yet'
375                );
376                continue;
377            }
378
379            if ($chunklen + $slen < $this->getChunkSize()) {
380                // add to current chunk
381                $chunk .= $sentence;
382                $chunklen += $slen;
383                // remember sentence for overlap check
384                $this->rememberSentence($sentence);
385            } else {
386                // add current chunk to result
387                $chunk = trim($chunk);
388                if ($chunk !== '') $chunks[] = $chunk;
389
390                // start new chunk with remembered sentences
391                $chunk = implode(' ', $this->sentenceQueue);
392                $chunk .= $sentence;
393                $chunklen = count($tiktok->encode($chunk));
394            }
395        }
396        $chunks[] = $chunk;
397
398        return $chunks;
399    }
400
401    /**
402     * Add a sentence to the queue of remembered sentences
403     *
404     * @param string $sentence
405     * @return void
406     */
407    protected function rememberSentence($sentence)
408    {
409        // add sentence to queue
410        $this->sentenceQueue[] = $sentence;
411
412        // remove oldest sentences from queue until we are below the max overlap
413        $encoder = $this->getTokenEncoder();
414        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
415            array_shift($this->sentenceQueue);
416        }
417    }
418}
419