<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
use TikToken\Encoder;
use Vanderlee\Sentence\Sentence;

/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to getChunkSize() tokens each. For each chunk the embedding
 * vector is fetched from the configured embedding model and stored in the storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var string[] remembered sentences used to overlap consecutive chunks */
    private $sentenceQueue = [];

    /** @var float the time spent (in seconds) for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured maximum chunk size in tokens */
    protected $configChunkSize;
    /** @var int the configured maximum number of context chunks to hand to the chat model */
    protected $configContextChunks;
    /** @var float the minimum similarity score (0-1) a chunk needs to be used as context */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        array $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100; // configured as a percentage
    }

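    /*
     * Minimal usage sketch (hypothetical wiring; in the plugin the models and the storage
     * backend are created from the plugin configuration, $chatModel, $embedModel and $storage
     * stand for any ChatInterface, EmbeddingInterface and AbstractStorage implementations,
     * and the config values are example numbers only):
     *
     *   $embeddings = new Embeddings($chatModel, $embedModel, $storage, [
     *       'chunkSize' => 1500,          // max tokens per chunk
     *       'contextChunks' => 5,         // max chunks handed to the chat model
     *       'similarityThreshold' => 75,  // percentage, stored internally as 0.75
     *   ]);
     *   $embeddings->createNewIndex();                            // (re)build the index
     *   $chunks = $embeddings->getSimilarChunks('How do I ...?'); // query it
     */
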
    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * @return int
     */
    public function getChunkSize()
    {
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

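    /*
     * Worked example for getChunkSize() above, with assumed model limits (illustrative
     * numbers, not actual model specs): a chat model accepting 8000 input tokens, an
     * embedding model accepting 8191 tokens and a configured chunk size of 1500 give
     *
     *   min(floor(8000 / 4), floor(8191 * 0.9), 1500) = min(2000, 7371, 1500) = 1500
     *
     * so the configured value wins unless one of the models is more restrictive.
     */
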
    /**
     * Update the embeddings storage
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

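    /*
     * Example calls (the regular expressions are hypothetical; any full PCRE with delimiters
     * works, $matchRE is tested against the page ID prefixed with a colon):
     *
     *   $embeddings->createNewIndex();                       // incremental update of all pages
     *   $embeddings->createNewIndex('/archive/', '', true);  // clear the index, skip archive pages
     *   $embeddings->createNewIndex('', '/^:wiki:/');        // only consider the wiki namespace
     */
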
    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the plugin's 'aichat' renderer to get a plain text version of the page.
     * If rendering fails, the raw wiki text is used instead.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD to support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        global $ID;
        $ID = $page; // some of the DokuWiki functions used below rely on the global $ID
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) $this->logger->error(
                'Failed to render page {page}. Using raw text instead. {msg}',
                ['page' => $page, 'msg' => $e->getMessage()]
            );
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            $text = $eventData['body'] . ' ' . $text;
        } else {
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

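    /*
     * The text stored in each chunk is prefixed with the breadcrumb trail built by
     * breadcrumbTrail(), so a chunk for a page like "wiki:syntax" might look roughly like
     * this (titles and content are illustrative assumptions):
     *
     *   Wiki (wiki) » Formatting Syntax (syntax)
     *
     *   DokuWiki supports some simple markup language ...
     */
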
    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read and may return an empty result.
     * The number of returned chunks is limited by the contextChunks setting and by what fits
     * into the chat model's maximum input token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $fetch = min(
            ($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

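    /*
     * Example use (hypothetical caller code; getPage(), getScore() and getText() are the
     * Chunk accessors already used above):
     *
     *   foreach ($embeddings->getSimilarChunks('How do I restore an old revision?', 'en') as $chunk) {
     *       printf("%s (%.2f)\n", $chunk->getPage(), $chunk->getScore());
     *   }
     */
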
    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the language model some context.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() returns false for pages in the root namespace, which would yield a bogus empty crumb
        $namespaces = array_filter(explode(':', (string)getNS($id)));
        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

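    /*
     * For a page like "wiki:plugins:aichat" the result might look like (titles assumed):
     *
     *   Wiki (wiki) » Plugins (plugins) » AI Chat (aichat)
     *
     * i.e. the first heading found for each namespace followed by the page's own heading,
     * falling back to the raw namespace and page names where no heading exists.
     */
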
    /**
     * Split the given text into chunks of roughly getChunkSize() tokens (plus the sentence overlap)
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting overly long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxChunkLen = $this->getChunkSize();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        // don't forget the last chunk
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

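    /*
     * Chunking sketch: consecutive chunks overlap by up to MAX_OVERLAP_LEN tokens, because a
     * new chunk is seeded with the sentences remembered via rememberSentence(). Schematically
     * (s1..s7 are sentences, assuming s4 and s5 together fit into the overlap budget):
     *
     *   chunk 1: s1 s2 s3 s4 s5
     *   chunk 2:          s4 s5 s6 s7 ...
     */
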
    /**
     * Add a sentence to the queue of remembered sentences
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}