xref: /plugin/aichat/Embeddings.php (revision ab1f8dde36106432cc0a6f320220da5fae6971fe)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Extension\PluginInterface;
7use dokuwiki\plugin\aichat\Model\ChatInterface;
8use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
9use dokuwiki\plugin\aichat\Storage\AbstractStorage;
10use dokuwiki\Search\Indexer;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of limited token size (see getChunkSize()). For each chunk the embedding
 * vector is fetched from the configured embedding model and stored in the storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface chat model, its input limit bounds chunk size and context size */
    protected $chatModel;

    /** @var EmbeddingInterface model used to calculate embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger, see setLogger() */
    protected $logger;
    /** @var Encoder|null lazily created by getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var string[] most recent sentences, used as overlap when a new chunk is started */
    private $sentenceQueue = [];

    /** @var float seconds spent in the storage backend for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int configured upper limit for the chunk size in tokens */
    protected $configChunkSize;
    /** @var int configured upper limit for the number of chunks fetched as context */
    protected $configContextChunks;
    /** @var float minimum score a chunk needs to be used (configured value divided by 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs the keys chunkSize, contextChunks
     *                      and similarityThreshold
     */
    public function __construct(
        ChatInterface      $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage    $storage,
                           $config
    )
    {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's input limit, 90% of the embedding model's
     * input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns a float, cast so callers really get the documented int
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the indexer. Pages that are missing, hidden, very small or excluded
     * by the given expressions have their chunks removed. Chunks of unchanged pages are reused, all other
     * pages are split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            // note: $skipRE is matched against the plain page ID, $matchRE against the ID with a leading colon
            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        global $ID;
        $chunkList = [];

        // rendering needs the global $ID to point at the page, remember the old value to restore it later
        $originalID = $ID;
        try {
            $textRenderer = plugin_load('renderer', 'text');
            if ($textRenderer instanceof PluginInterface) {
                $ID = $page;
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } else {
                $text = rawWiki($page);
            }

            // allow plugins to modify the text before splitting
            $eventData = [
                'page' => $page,
                'body' => '',
                'metadata' => ['title' => $page, 'relation_references' => []],
            ];
            $event = new Event('INDEXER_PAGE_ADD', $eventData);
            if ($event->advise_before()) {
                // default action allowed: prepend whatever the plugins added
                $text = $eventData['body'] . ' ' . $text;
            } else {
                // default action prevented: only use what the plugins provided
                $text = $eventData['body'];
            }
        } finally {
            // do not leak the changed page context to the caller
            $ID = $originalID;
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++; // IDs are only consumed by chunks that were actually created
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the contextChunks setting, the similarity threshold
     * and the maximum input length of the chat model.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // number of full chunks that fit into the chat model input, capped by the configuration.
        // The storage expects a whole number, so the result is floored and cast to int.
        $fetch = (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / max(1, $this->getChunkSize())),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of at most roughly getChunkSize() tokens
     *
     * The text is split into sentences which are then combined into chunks. Each new chunk starts with
     * the most recently seen sentences (up to MAX_OVERLAP_LEN tokens) to create an overlap between chunks.
     * Sentences that are longer than the chunk size on their own are skipped.
     *
     * @param string $text
     * @return string[] the non-empty, trimmed chunks
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting, so only calculate it once

        // never reuse sentences remembered from a previously split text
        $this->sentenceQueue = [];

        $chunks = [];
        $chunklen = 0;
        $chunk = '';
        // foreach instead of a truthiness check on array_shift(): a sentence like "0" must not end the loop
        foreach ($sentenceSplitter->split((string)$text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember every used sentence for the overlap, including the one that opened a new chunk.
            // This has to happen after the overlap above was built from the previous sentences.
            $this->rememberSentence($sentence);
        }

        // add the last chunk, treated the same way as all the chunks before
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is shortened from the front until its joined content is no longer than
     * MAX_OVERLAP_LEN tokens. A single sentence exceeding that limit leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
348