xref: /plugin/aichat/Embeddings.php (revision 34a1c47875552330ce367360d99f2c3f9f69af94)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\ChatInterface;
7use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
8use dokuwiki\plugin\aichat\Storage\AbstractStorage;
9use dokuwiki\Search\Indexer;
10use splitbrain\phpcli\CLI;
11use TikToken\Encoder;
12use Vanderlee\Sentence\Sentence;
13
14/**
15 * Manage the embeddings index
16 *
 * Pages are split into chunks of a configurable token size. For each chunk the embedding vector is fetched
 * from the configured embedding model backend and stored in the Storage backend.
19 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model — its max input length caps chunk and context sizes */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create embedding vectors for chunks and queries */
    protected $embedModel;

    /** @var CLI|null optional CLI logger for progress and error reporting */
    protected $logger;

    /** @var Encoder lazily instantiated token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend persisting chunks and their vectors */
    protected $storage;

    /** @var string[] sliding window of recent sentences, used as overlap between adjacent chunks */
    private $sentenceQueue = [];

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;

    /** @var int configured number of chunks to pass to the chat model as context */
    protected $configContextChunks;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs 'chunkSize' and 'contextChunks' keys
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's input window (so several chunks plus the
     * question still fit), 90% of the embedding model's input window (safety margin), and the
     * configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        return min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages; pages that should not be indexed (missing, hidden, tiny,
     * or filtered by the regexes) have their chunks removed. Unchanged pages reuse their stored
     * chunks, changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding request fails
     * are logged and skipped, the remaining chunks are still returned.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            // the renderer reads the global $ID, set it so relative resolving works
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $chunkID = $firstChunkID; // running ID, keeps the parameter name meaningful
        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $chunkID, $part, $embedding);
            $chunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the chat model's input window and the
     * configured context chunk count.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        $maxInputTokens = $this->chatModel->getMaxInputTokenLength();
        $fetch = (int)ceil(
            min(
                $maxInputTokens / $this->getChunkSize(),
                $this->configContextChunks
            )
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $maxInputTokens) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Split the given text into token-limited chunks with sentence overlap
     *
     * @param string $text
     * @return array the list of chunk strings
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        // hoisted out of the loop: getChunkSize() queries both model backends
        $maxChunkLen = $this->getChunkSize();

        // start with a clean overlap queue so sentences from a previously
        // processed text cannot bleed into the first chunk of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a sentence like "0" is falsy but still valid content
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning('Sentence too long, splitting not implemented yet');
                }
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // current chunk is full, add it to the result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences as overlap
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // keep the boundary sentence in the overlap window, too
                $this->rememberSentence($sentence);
            }
        }
        // flush the last chunk, but do not emit an empty trailing chunk
        if (trim($chunk) !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * Oldest sentences are dropped until the queue fits into MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
327