xref: /plugin/aichat/Embeddings.php (revision aee9b3838d65096a9b570fe90c0fd4cb03ab738f)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\plugin\aichat\Model\AbstractModel;
6use dokuwiki\plugin\aichat\Storage\AbstractStorage;
7use dokuwiki\Search\Indexer;
8use splitbrain\phpcli\CLI;
9use TikToken\Encoder;
10use Vanderlee\Sentence\Sentence;
11
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most the model's maximum embedding token length. For each
 * chunk the embedding vector is fetched from the model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    const MAX_OVERLAP_LEN = 200;

    /** @var AbstractModel the model used to create embedding vectors */
    protected $model;
    /** @var CLI|null optional logger, injected via setLogger() */
    protected $logger;
    /** @var Encoder token encoder, lazily created in getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage backend the chunks are persisted in */
    protected $storage;

    /** @var string[] most recently seen sentences, kept to create overlap between adjacent chunks */
    private $sentenceQueue = [];

    /**
     * @param AbstractModel $model model to fetch embedding vectors from
     * @param AbstractStorage $storage backend to persist the chunks in
     */
    public function __construct(AbstractModel $model, AbstractStorage $storage)
    {
        $this->model = $model;
        $this->storage = $storage;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance (created on first use)
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if ($this->tokenEncoder === null) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify (missing, hidden, very
     * small or matching $skipRE) have their chunks removed. Pages older than their stored
     * chunks keep the existing chunks; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100, i.e. a page may hold at most 100 chunks
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            $ID = $page; // the renderer relies on the global $ID being set
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) === '') continue; // skip empty chunks (strict comparison)

            try {
                $embedding = $this->model->getEmbedding($part);
            } catch (\Exception $e) {
                // skip this chunk but keep processing the remainder of the page
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks depends on the MAX_CONTEXT_LEN setting.
     *
     * @param string $query The question
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query)
    {
        global $auth;
        $vector = $this->model->getEmbedding($query);

        $fetch = ceil(
            ($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
            * 1.5 // fetch a few more than needed, since not all chunks are maximum length
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $fetch);
        if ($this->logger) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }


    /**
     * Split the given text into chunks on sentence boundaries
     *
     * Each chunk stays below the model's maximum embedding token length. Adjacent chunks
     * share up to MAX_OVERLAP_LEN tokens of trailing sentences so context is not lost at
     * the chunk borders.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();

        // start with a fresh overlap queue — otherwise sentences from a previously
        // processed text would leak into the first overflowing chunk of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // strict null check: a falsy sentence such as '0' must not end the loop early
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
                // remember sentence for overlap check
                $this->rememberSentence($sentence);
            } else {
                // add current chunk to result
                $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = join(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
                // the sentence opening this chunk belongs in the next overlap as well
                $this->rememberSentence($sentence);
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front so its joined token count stays at or
     * below MAX_OVERLAP_LEN.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(join(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
281