xref: /plugin/aichat/Embeddings.php (revision 5f71c9bbe31dfcc1db5ab9659debc4833c4ec6eb)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\ChatInterface;
7use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
8use dokuwiki\plugin\aichat\Storage\AbstractStorage;
9use dokuwiki\Search\Indexer;
10use splitbrain\phpcli\CLI;
11use TikToken\Encoder;
12use Vanderlee\Sentence\Sentence;
13
14/**
15 * Manage the embeddings index
16 *
17 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
18 * OpenAI and stored in the Storage backend.
19 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the chat model — its max input length caps chunk size and context */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create the embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;
    /** @var Encoder lazily instantiated token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage the backend the chunk vectors are persisted in */
    protected $storage;

    /** @var string[] remember sentences when chunking, reused as overlap for the next chunk */
    private $sentenceQueue = [];

    /** @var float the time spent for the last similar chunk retrieval, in seconds */
    public $timeSpent = 0;

    /** @var int configured maximum chunk size in tokens */
    protected $configChunkSize;
    /** @var int configured maximum number of context chunks to pass to the chat model */
    protected $configContextChunks;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs the 'chunkSize' and 'contextChunks' keys
     */
    public function __construct(
        ChatInterface      $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage    $storage,
                           $config
    )
    {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        // cast defensively — config values may arrive as strings
        $this->configChunkSize = (int)$config['chunkSize'];
        $this->configContextChunks = (int)$config['contextChunks'];
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * The smallest of: a quarter of the chat model's input window, 90% of the
     * embedding model's input window, and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns float — cast so we really return the documented int
        return (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify are removed from the
     * storage, unchanged pages have their existing chunks reused, changed pages are re-chunked
     * and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page")) // note: matched against the absolute id
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks whose embedding fetch fails are skipped (logged when a logger is set).
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            $ID = $page; // the text renderer needs the current page context
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) === '') continue; // skip empty chunks

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the chat model's input window and the
     * configured context chunk limit.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // floor and cast — a fractional chunk count makes no sense for the storage backend
        $fetch = (int)min(
            floor($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks of roughly getChunkSize() tokens
     *
     * Splitting happens on sentence boundaries; consecutive chunks overlap by up to
     * MAX_OVERLAP_LEN tokens of remembered sentences. Sentences longer than a whole
     * chunk are currently dropped (with a warning).
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxChunkLen = $this->getChunkSize(); // loop invariant, avoid recomputing per sentence

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // explicit null check so a falsy sentence like "0" doesn't end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxChunkLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $maxChunkLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences as overlap
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        // don't emit a trailing empty chunk (e.g. for empty input)
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its total token count stays at or
     * below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
329