xref: /plugin/aichat/Embeddings.php (revision 072e009990858d649f31eceb61c1bc980d28f40c)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\File\PageResolver;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks whose token size is derived from the plugin configuration and the
 * limits of the configured models (see getChunkSize()). For each chunk the embedding vector is
 * fetched from the embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;

    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var float the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured maximum number of chunks to use as context */
    protected $configContextChunks;

    /** @var float minimum similarity score (0..1) a chunk needs to be used */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration. Reads the keys chunkSize, contextChunks and
     *                      similarityThreshold (the latter is given in percent)
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        // configuration values may arrive as numeric strings, normalize them once
        $this->configChunkSize = (int) $config['chunkSize'];
        $this->configContextChunks = (int) $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Override the number of used context chunks
     *
     * @param int $max
     * @return void
     * @throws \InvalidArgumentException when $max is not greater than 0
     */
    public function setConfigContextChunks(int $max)
    {
        if ($max <= 0) throw new \InvalidArgumentException('max context chunks must be greater than 0');
        $this->configContextChunks = $max;
    }

    /**
     * Override the similarity threshold
     *
     * @param float $threshold A value between 0 and 1 (inclusive)
     * @return void
     * @throws \InvalidArgumentException when $threshold is outside of 0..1
     */
    public function setSimilarityThreshold(float $threshold)
    {
        if ($threshold < 0 || $threshold > 1) throw new \InvalidArgumentException('threshold must be between 0 and 1');
        $this->similarityThreshold = $threshold;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and then reused.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's input limit, 90% of the embedding
     * model's input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns floats, so the result is cast to honor the documented int return type
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Walks all pages known to the search indexer. Pages that do not qualify get their chunks
     * removed, pages older than their stored chunks keep them, all other pages are re-chunked.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): this scheme leaves room for 100 chunk IDs per page, createPageChunks()
            // does not enforce that bound - confirm that very large pages can not overflow it
            $chunkID = $pid * 100;

            // note that $skipRE is matched against the plain page ID while $matchRE is matched
            // against the ID with a leading colon
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page is rendered in 'aichat' mode to get its text. If rendering fails, the raw wiki
     * text is used instead. Parts for which no embedding could be fetched are logged and skipped.
     *
     * Note: the global $ID is set to $page and is not restored afterwards.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // plugins may have added text to the body, it is put in front of the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // the default was prevented: only the body provided through the event is used
            $text = $eventData['body'];
        }

        $splitter = new TextSplitter($this->getChunkSize(), $this->getTokenEncoder());
        $parts = $splitter->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++; // only successfully embedded parts consume a chunk ID
        }

        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read and whose score reaches the
     * similarity threshold. May return an empty result.
     *
     * At most the configured number of context chunks is returned. When $limits is enabled, the
     * result is additionally restricted so that the combined token count of all returned chunks
     * fits into the chat model's maximum input length.
     *
     * The time spent in the storage lookup is made available in $timeSpent.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @param bool $limits Apply chat token limits to the number of chunks returned?
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '', $limits = true)
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxChunks = (int) $this->configContextChunks;
        if ($limits) {
            $maxInput = $this->chatModel->getMaxInputTokenLength();
            // the storage needs a whole number of chunks; also never divide by a zero chunk size
            $fetch = (int) min(
                floor($maxInput / max(1, $this->getChunkSize())),
                $maxChunks
            );
        } else {
            $fetch = $maxChunks;
        }

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds. Query: {query}',
                ['count' => count($chunks), 'time' => $this->timeSpent, 'query' => $query]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            // token usage is only tracked when limits are applied
            $chunkSize = 0;
            if ($limits) {
                $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
                if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough
            }

            $result[] = $chunk;
            $size += $chunkSize;

            if (count($result) >= $this->configContextChunks) break; // we have enough
        }
        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context. Pages in the root namespace only get the crumb of
     * the page itself.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        // getNS() returns false for pages in the root namespace. Exploding that would yield a
        // single empty namespace and thus a bogus empty crumb
        $ns = getNS($id);
        $namespaces = ($ns === false || $ns === '') ? [] : explode(':', $ns);

        $resolver = new PageResolver($id);
        $crumbs = [];

        // all namespaces
        $check = '';
        foreach ($namespaces as $namespace) {
            $check .= $namespace . ':';
            $page = $resolver->resolveId($check);
            $title = p_get_first_heading($page);
            $crumbs[] = $title ? "$title ($namespace)" : $namespace;
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }
}
350}
351