xref: /plugin/aichat/Embeddings.php (revision 6a18e0f40fd2d3238b0284483f1ee9aa53dad036)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\AbstractChatModel;
7use dokuwiki\plugin\aichat\Model\AbstractEmbeddingModel;
8use dokuwiki\plugin\aichat\Storage\AbstractStorage;
9use dokuwiki\Search\Indexer;
10use splitbrain\phpcli\CLI;
11use TikToken\Encoder;
12use Vanderlee\Sentence\Sentence;
13
14/**
15 * Manage the embeddings index
16 *
 * Pages are split into chunks whose size is limited by getChunkSize(). For each chunk the embedding vector is
 * fetched from the configured embedding model and stored in the Storage backend.
19 */
20class Embeddings
21{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractChatModel chat model, consulted for its token limits */
    protected $chatModel;

    /** @var AbstractEmbeddingModel model used to fetch the embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger, only available after setLogger() was called */
    protected $logger;
    /** @var Encoder|null tokenizer, lazily created by getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend the chunks are stored in */
    protected $storage;

    /** @var array recently chunked sentences, prepended as overlap when a new chunk is started */
    private $sentenceQueue = [];
41
42    public function __construct(
43        AbstractChatModel $chatModel,
44        AbstractEmbeddingModel $embedModel,
45        AbstractStorage $storage
46    ) {
47        $this->chatModel = $chatModel;
48        $this->embedModel = $embedModel;
49        $this->storage = $storage;
50    }
51
    /**
     * Access the storage backend that was passed to the constructor
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }
61
    /**
     * Add a logger instance
     *
     * Without a logger, all progress and error messages of this class are silently dropped.
     *
     * @param CLI $logger The CLI instance messages are written to
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }
71
72    /**
73     * Get the token encoder instance
74     *
75     * @return Encoder
76     */
77    public function getTokenEncoder()
78    {
79        if (!$this->tokenEncoder instanceof Encoder) {
80            $this->tokenEncoder = new Encoder();
81        }
82        return $this->tokenEncoder;
83    }
84
85    /**
86     * Return the chunk size to use
87     *
88     * @return int
89     */
90    public function getChunkSize()
91    {
92        return min(
93            $this->chatModel->getMaxEmbeddingTokenLength(),
94            $this->embedModel->getMaxEmbeddingTokenLength()
95        );
96    }
97
98    /**
99     * Update the embeddings storage
100     *
101     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
102     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
103     * @param bool $clear Should any existing storage be cleared before updating?
104     * @return void
105     * @throws \Exception
106     */
107    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
108    {
109        $indexer = new Indexer();
110        $pages = $indexer->getPages();
111
112        $this->storage->startCreation($clear);
113        foreach ($pages as $pid => $page) {
114            $chunkID = $pid * 100; // chunk IDs start at page ID * 100
115
116            if (
117                !page_exists($page) ||
118                isHiddenPage($page) ||
119                filesize(wikiFN($page)) < 150 || // skip very small pages
120                ($skipRE && preg_match($skipRE, (string) $page)) ||
121                ($matchRE && !preg_match($matchRE, ":$page"))
122            ) {
123                // this page should not be in the index (anymore)
124                $this->storage->deletePageChunks($page, $chunkID);
125                continue;
126            }
127
128            $firstChunk = $this->storage->getChunk($chunkID);
129            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
130                // page is older than the chunks we have, reuse the existing chunks
131                $this->storage->reusePageChunks($page, $chunkID);
132                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
133            } else {
134                // page is newer than the chunks we have, create new chunks
135                $this->storage->deletePageChunks($page, $chunkID);
136                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
137            }
138        }
139        $this->storage->finalizeCreation();
140    }
141
142    /**
143     * Split the given page, fetch embedding vectors and return Chunks
144     *
145     * Will use the text renderer plugin if available to get the rendered text.
146     * Otherwise the raw wiki text is used.
147     *
148     * @param string $page Name of the page to split
149     * @param int $firstChunkID The ID of the first chunk of this page
150     * @return Chunk[] A list of chunks created for this page
151     * @throws \Exception
152     */
153    protected function createPageChunks($page, $firstChunkID)
154    {
155        $chunkList = [];
156
157        $textRenderer = plugin_load('renderer', 'text');
158        if ($textRenderer instanceof PluginInterface) {
159            global $ID;
160            $ID = $page;
161            $text = p_cached_output(wikiFN($page), 'text', $page);
162        } else {
163            $text = rawWiki($page);
164        }
165
166        $parts = $this->splitIntoChunks($text);
167        foreach ($parts as $part) {
168            if (trim((string) $part) == '') continue; // skip empty chunks
169
170            try {
171                $embedding = $this->embedModel->getEmbedding($part);
172            } catch (\Exception $e) {
173                if ($this->logger instanceof CLI) {
174                    $this->logger->error(
175                        'Failed to get embedding for chunk of page {page}: {msg}',
176                        ['page' => $page, 'msg' => $e->getMessage()]
177                    );
178                }
179                continue;
180            }
181            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
182            $firstChunkID++;
183        }
184        if ($this->logger instanceof CLI) {
185            if ($chunkList !== []) {
186                $this->logger->success(
187                    '{id} split into {count} chunks',
188                    ['id' => $page, 'count' => count($chunkList)]
189                );
190            } else {
191                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
192            }
193        }
194        return $chunkList;
195    }
196
197    /**
198     * Do a nearest neighbor search for chunks similar to the given question
199     *
200     * Returns only chunks the current user is allowed to read, may return an empty result.
201     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
202     *
203     * @param string $query The question
204     * @param string $lang Limit results to this language
205     * @return Chunk[]
206     * @throws \Exception
207     */
208    public function getSimilarChunks($query, $lang = '')
209    {
210        global $auth;
211        $vector = $this->embedModel->getEmbedding($query);
212
213        $fetch = ceil(
214            ($this->getChunkSize() / $this->chatModel->getMaxEmbeddingTokenLength())
215            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
216        );
217
218        $time = microtime(true);
219        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
220        if ($this->logger instanceof CLI) {
221            $this->logger->info(
222                'Fetched {count} similar chunks from store in {time} seconds',
223                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
224            );
225        }
226
227        $size = 0;
228        $result = [];
229        foreach ($chunks as $chunk) {
230            // filter out chunks the user is not allowed to read
231            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
232
233            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
234            if ($size + $chunkSize > $this->chatModel->getMaxContextTokenLength()) break; // we have enough
235
236            $result[] = $chunk;
237            $size += $chunkSize;
238        }
239        return $result;
240    }
241
242
243    /**
244     * @param $text
245     * @return array
246     * @throws \Exception
247     * @todo support splitting too long sentences
248     */
249    public function splitIntoChunks($text)
250    {
251        $sentenceSplitter = new Sentence();
252        $tiktok = $this->getTokenEncoder();
253
254        $chunks = [];
255        $sentences = $sentenceSplitter->split($text);
256
257        $chunklen = 0;
258        $chunk = '';
259        while ($sentence = array_shift($sentences)) {
260            $slen = count($tiktok->encode($sentence));
261            if ($slen > $this->getChunkSize()) {
262                // sentence is too long, we need to split it further
263                if ($this->logger instanceof CLI) $this->logger->warning(
264                    'Sentence too long, splitting not implemented yet'
265                );
266                continue;
267            }
268
269            if ($chunklen + $slen < $this->getChunkSize()) {
270                // add to current chunk
271                $chunk .= $sentence;
272                $chunklen += $slen;
273                // remember sentence for overlap check
274                $this->rememberSentence($sentence);
275            } else {
276                // add current chunk to result
277                $chunks[] = $chunk;
278
279                // start new chunk with remembered sentences
280                $chunk = implode(' ', $this->sentenceQueue);
281                $chunk .= $sentence;
282                $chunklen = count($tiktok->encode($chunk));
283            }
284        }
285        $chunks[] = $chunk;
286
287        return $chunks;
288    }
289
290    /**
291     * Add a sentence to the queue of remembered sentences
292     *
293     * @param string $sentence
294     * @return void
295     */
296    protected function rememberSentence($sentence)
297    {
298        // add sentence to queue
299        $this->sentenceQueue[] = $sentence;
300
301        // remove oldest sentences from queue until we are below the max overlap
302        $encoder = $this->getTokenEncoder();
303        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
304            array_shift($this->sentenceQueue);
305        }
306    }
307}
308