xref: /plugin/aichat/Embeddings.php (revision 614f8ab4ac738aed3502238736999a25e6cf719a)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\plugin\aichat\backend\AbstractStorage;
6use dokuwiki\plugin\aichat\backend\Chunk;
7use dokuwiki\plugin\aichat\backend\KDTreeStorage;
8use dokuwiki\plugin\aichat\backend\SQLiteStorage;
9use dokuwiki\Search\Indexer;
10use Hexogen\KDTree\Exception\ValidationException;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
15/**
16 * Manage the embeddings index
17 *
18 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
19 * OpenAI and stored in the Storage backend.
20 */
class Embeddings
{
    /** Maximum number of tokens allowed in a single chunk */
    const MAX_TOKEN_LEN = 1000;

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;

    /** @var AbstractStorage backend holding the chunks and their vectors */
    protected $storage;

    /**
     * @param OpenAI $openAI The client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage();
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all pages known to the fulltext indexer. Pages that no longer qualify
     * (deleted, hidden, tiny or matching $skipRE) are removed from the index, pages that are
     * unchanged since their chunks were created are reused, everything else is re-chunked
     * and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the dimensionality of OpenAI's text-embedding-ada-002 vectors
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used.
     *
     * Chunks whose embedding request fails are skipped (and logged); the remaining
     * chunks of the page are still processed.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            // the renderer needs the current page context in $ID; save and restore the
            // global so rendering here does not leak into the rest of the request
            $keepID = $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
            $ID = $keepID;
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim($part) == '') continue; // skip empty chunks

            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                // a failed chunk should not abort the whole page; log and move on
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            if (count($chunkList)) {
                $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunkList)]);
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // fetch more than requested because ACL filtering below may discard some;
        // the result is still capped at $limit
        $chunks = $this->storage->getSimilarChunks($vector, $limit * 2);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Splitting happens on sentence boundaries; sentences that are individually longer
     * than MAX_TOKEN_LEN are dropped (with a warning) since further splitting is not
     * implemented yet.
     *
     * @param string $text
     * @return string[] the text chunks, possibly empty
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift()): the shift-loop stopped at the first
        // falsy sentence (e.g. '0'), silently dropping the rest of the text
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // current chunk is full, start a new one
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // only keep the trailing chunk if it has content (avoids returning [''] for empty text)
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }
}
227