1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Extension\PluginInterface;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
15/**
16 * Manage the embeddings index
17 *
 * Pages are split into chunks whose maximum size in tokens is configurable (see getChunkSize()). For each chunk
 * the embedding vector is fetched from the configured embedding model and stored in the Storage backend.
20 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model, its maximum input length limits chunk size and context size */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to calculate embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger, see setLogger() */
    protected $logger;

    /** @var Encoder lazily created token encoder, see getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the backend the chunks are stored in */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var int|float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int|string the configured chunk size in tokens, see getChunkSize() */
    protected $configChunkSize;

    /** @var int|string the configured maximum number of chunks to use as context */
    protected $configContextChunks;

    /** @var float minimum score a chunk needs to be used (configured percentage divided by 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs the keys chunkSize, contextChunks
     *                      and similarityThreshold (given in percent)
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is configured in percent, scores are compared as fractions
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * When set, progress and problems during indexing and retrieval are reported to it.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's maximum input, 90% of the embedding
     * model's maximum input and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns floats and the configuration may hold a numeric string, so cast the result
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search indexer. Pages that should not be indexed get their
     * chunks removed, unchanged pages have their existing chunks reused and all other pages are chunked
     * and embedded anew.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): a page producing more than 100 chunks would run into the ID range of the
            // next page - confirm how the storage backends deal with that
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // handlers may have added text to the body, it is put in front of the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // the default was prevented, only use what the handlers provided
            $text = $eventData['body'];
        }

        $chunkID = $firstChunkID;
        foreach ($this->splitIntoChunks($text) as $part) {
            if (trim((string)$part) === '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            // IDs are only consumed by chunks that actually got an embedding
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++;
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the contextChunks configuration and by how many
     * chunks fit into the maximum input length of the chat model. Chunks scoring below the
     * configured similarity threshold are dropped.
     *
     * The time spent in the storage lookup is made available in $timeSpent.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxInput = $this->chatModel->getMaxInputTokenLength();

        // the number of chunks to fetch has to be a whole number; guard against a zero chunk size
        $fetch = (int) min(
            floor($maxInput / max(1, $this->getChunkSize())),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $tokens = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $tokens > $maxInput) break; // we have enough

            $result[] = $chunk;
            $size += $tokens;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of at most getChunkSize() tokens
     *
     * The text is split into sentences which are collected into chunks. When a chunk is full, the next
     * chunk starts with the most recently seen sentences (up to MAX_OVERLAP_LEN tokens) to create an
     * overlap between consecutive chunks. Returned chunks are trimmed and never empty.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     * @todo the overlap is added on top of the sentence starting a chunk, so such a chunk may exceed the chunk size
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting

        // overlap sentences of a previously split text must not leak into this one
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';

        // iterate instead of shifting in a while condition: a falsy sentence like "0" must not end the loop
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember every used sentence so the next overlap is a gapless run of the latest sentences
            $this->rememberSentence($sentence);
        }

        // add the remainder, treated like all the other chunks
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is kept at or below MAX_OVERLAP_LEN tokens by dropping the oldest sentences. A single
     * sentence longer than that limit empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
347