xref: /plugin/aichat/Embeddings.php (revision d5c102b3f940c6a699499e715eeb66b02677d7df)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Extension\PluginInterface;
6use dokuwiki\plugin\aichat\Model\AbstractModel;
7use dokuwiki\plugin\aichat\Storage\AbstractStorage;
8use dokuwiki\Search\Indexer;
9use splitbrain\phpcli\CLI;
10use TikToken\Encoder;
11use Vanderlee\Sentence\Sentence;
12
13/**
14 * Manage the embeddings index
15 *
16 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
17 * OpenAI and stored in the Storage backend.
18 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel the model used to create embedding vectors */
    protected $model;
    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;
    /** @var Encoder lazily initialized token encoder, access via getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend that persists the chunks and their vectors */
    protected $storage;

    /** @var string[] recently seen sentences, reused as overlap between chunks */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model The model used to create the embedding vectors
     * @param AbstractStorage $storage The backend used to persist the chunks
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * When set, progress and errors are reported through it.
     *
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * Created on first use and cached for all later calls.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages, deletes chunks of pages that should no longer be in
     * the index, reuses chunks of unchanged pages and (re)creates chunks for changed pages.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string) $page)) ||
                // NOTE(review): $matchRE is tested against ":$page" while $skipRE is tested
                // against the bare page id — confirm this asymmetry is intentional
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding fails are
     * skipped (logged), the rest of the page is still processed.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer instanceof PluginInterface) {
            global $ID;
            // the renderer relies on $ID pointing to the current page; save and
            // restore it so the override does not leak to later code
            $previousID = $ID;
            $ID = $page;
            try {
                $text = p_cached_output(wikiFN($page), 'text', $page);
            } finally {
                $ID = $previousID;
            }
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string) $part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                // log and continue with the remaining chunks of this page
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        // fetch a few more than strictly fit into the context window,
        // since not all chunks are maximum length
        $fetch = (int) ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks that fit the model's embedding token limit
     *
     * Consecutive chunks share up to MAX_OVERLAP_LEN tokens of trailing sentences
     * so that context is not lost at chunk boundaries. Sentences longer than the
     * token limit are skipped with a warning.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        // start with a clean overlap queue so no sentences leak over from a
        // previously chunked document (the instance is reused for many pages)
        $this->sentenceQueue = [];

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: array_shift() returns null when the list is
        // exhausted, but a falsy sentence like "0" must not terminate the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) $this->logger->warning(
                    'Sentence too long, splitting not implemented yet'
                );
                continue;
            }

            if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // current chunk is full, add it to the result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences as overlap
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // keep the overlap rolling: this sentence must also be part of
                // the overlap for the chunk after this one
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk; // the last, possibly partial chunk (callers skip empty chunks)

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * Oldest sentences are dropped until the queue fits into MAX_OVERLAP_LEN tokens.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
286