xref: /plugin/aichat/Embeddings.php (revision f8d5ae013d1e8cb3669240e961cb98f1d60a5931)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\AbstractModel;
7use dokuwiki\plugin\aichat\Storage\AbstractStorage;
8use dokuwiki\Search\Indexer;
9use splitbrain\phpcli\CLI;
10use TikToken\Encoder;
11use Vanderlee\Sentence\Sentence;
12
13/**
14 * Manage the embeddings index
15 *
16 * Pages are split into chunks limited by the model's maximum embedding token length. For each chunk the
17 * embedding vector is fetched from the configured model and stored in the Storage backend.
18 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel the model used to create embedding vectors */
    protected $model;
    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;
    /** @var Encoder lazily initialized token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend holding the chunk index */
    protected $storage;

    /** @var string[] recently added sentences, kept to create overlap between chunks */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model The model used to create the embeddings
     * @param AbstractStorage $storage The storage backend for the chunks
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and then reused.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify (deleted, hidden, tiny or
     * matching the skip expression) have their chunks removed. Chunks of unchanged pages are
     * reused, everything else is re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks for which no embedding could be
     * fetched are skipped (the error is logged when a logger is set).
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            // the text renderer needs the global $ID to resolve links etc.
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                // best effort: log and continue with the remaining chunks
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the model's maximum context token length.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        $fetch = ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks not exceeding the model's embedding token limit
     *
     * Chunks are built sentence by sentence. When a chunk is full, the next chunk is
     * seeded with up to MAX_OVERLAP_LEN tokens of the most recent sentences so that
     * context is shared between consecutive chunks.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        // start with a clean overlap queue, otherwise sentences remembered during a
        // previous call would leak into the first overlap chunk of this text
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // explicit null check: a falsy sentence (e.g. the string '0') must not end the loop
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its total token count stays at or
     * below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
288