xref: /plugin/aichat/Embeddings.php (revision 720bb43f9ac252f6e0b09e7b06804dec7c547a47)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\ChatInterface;
7use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
8use dokuwiki\plugin\aichat\Storage\AbstractStorage;
9use dokuwiki\Search\Indexer;
10use splitbrain\phpcli\CLI;
11use TikToken\Encoder;
12use Vanderlee\Sentence\Sentence;
13
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most getChunkSize() tokens each. For each chunk the embedding vector
 * is fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface */
    protected $chatModel;

    /** @var EmbeddingInterface */
    protected $embedModel;

    /** @var CLI|null */
    protected $logger;
    /** @var Encoder */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var float|int the time in seconds spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int the configured chunk size in tokens */
    protected $configChunkSize;
    /** @var int the configured maximum number of chunks to use as context */
    protected $configContextChunks;
    /** @var float minimum score (0..1) a chunk needs to be considered similar */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration. Needs the keys chunkSize, contextChunks and
     *                      similarityThreshold (given in percent)
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is configured in percent, scores are compared as fractions
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's input limit, 90% of the embedding model's
     * input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns a float, so cast to really return the documented int
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the search index. Pages that do not qualify have their chunks
     * removed, unchanged pages keep their existing chunks and all other pages are chunked anew.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): createPageChunks() numbers chunks consecutively from this ID, so a page with
            // more than 100 chunks would presumably run into the ID range of the next page - confirm
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                // note: unlike $skipRE, $matchRE is applied to the page ID with a leading colon
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks for which no embedding could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // render in the context of the page, but do not leak the changed global to our callers
            $originalID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $originalID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the contextChunks configuration, the similarity
     * threshold and the maximum input token length of the chat model.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // the storage expects a whole number of chunks, the division may produce a fraction
        $fetch = (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * The text is split into sentences which are then collected into chunks. Each new chunk starts
     * with the last sentences of the previous one (up to MAX_OVERLAP_LEN tokens) to keep some context.
     * Sentences that are longer than the chunk size on their own are currently dropped with a warning.
     *
     * The returned list may contain empty chunks, callers should skip those.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize(); // does not change while splitting

        // the overlap must never contain sentences of a previously split text
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split((string)$text);

        $chunklen = 0;
        $chunk = '';
        // iterate explicitly: a falsy sentence like "0" must not end the loop early
        foreach ($sentences as $sentence) {
            $sentence = (string)$sentence;
            if ($sentence === '') continue;

            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence is part of the new chunk, so it belongs into the next overlap as well
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content is no longer than
     * MAX_OVERLAP_LEN tokens. A single sentence exceeding that limit empties the queue.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
331