xref: /plugin/aichat/Embeddings.php (revision 5786be46be4ea7477d2002e973db2f0e45f3db8b)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\Exception\ValidationException;
7use Hexogen\KDTree\FSKDTree;
8use Hexogen\KDTree\FSTreePersister;
9use Hexogen\KDTree\Item;
10use Hexogen\KDTree\ItemFactory;
11use Hexogen\KDTree\ItemList;
12use Hexogen\KDTree\KDTree;
13use Hexogen\KDTree\NearestSearch;
14use Hexogen\KDTree\Point;
15use splitbrain\phpcli\CLI;
16use TikToken\Encoder;
17use Vanderlee\Sentence\Sentence;
18
19/**
20 * Manage the embeddings index
21 *
 * Pages are split into chunks of at most 1000 tokens each. For each chunk, the embedding vector is
 * fetched from OpenAI and stored in a K-D tree; the chunk data itself is written to the file system.
24 */
25class Embeddings
26{
27
    /** Upper bound for the number of tokens in a single chunk, see splitIntoChunks() */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory below $conf['indexdir'] that holds all data of this index */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D tree inside the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch the embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger, only available after setLogger() has been called */
    protected $logger;
36
37    /**
38     * @param OpenAI $openAI
39     */
40    public function __construct(OpenAI $openAI)
41    {
42        $this->openAI = $openAI;
43    }
44
45    /**
46     * Add a logger instance
47     *
48     * @param CLI $logger
49     * @return void
50     */
51    public function setLogger(CLI $logger)
52    {
53        $this->logger = $logger;
54    }
55
56    /**
57     * Create a new K-D Tree from all pages
58     *
59     * Deletes the existing index
60     *
61     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
62     * @return void
63     * @throws ValidationException
64     */
65    public function createNewIndex($skipRE = '')
66    {
67        $indexer = new Indexer();
68        $pages = $indexer->getPages();
69
70        $itemList = new ItemList(1536);
71        foreach ($pages as $pid => $page) {
72            if (!page_exists($page)) continue;
73            if (isHiddenPage($page)) continue;
74            if ($skipRE && preg_match($skipRE, $page)) continue;
75
76            $chunkID = $pid * 100; // chunk IDs start at page ID * 100
77
78            $firstChunk = $this->getChunkFilePath($chunkID);
79            if (@filemtime(wikiFN($page)) < @filemtime($firstChunk)) {
80                // page is older than the chunks we have, reuse the existing chunks
81                $this->reusePageChunks($itemList, $page, $chunkID);
82            } else {
83                // page is newer than the chunks we have, create new chunks
84                $this->deletePageChunks($chunkID);
85                $this->createPageChunks($itemList, $page, $chunkID);
86            }
87        }
88
89        $tree = new KDTree($itemList);
90        if ($this->logger) {
91            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
92        }
93        $persister = new FSTreePersister($this->getStorageDir());
94        $persister->convert($tree, self::INDEX_FILE);
95    }
96
97    /**
98     * Split the given page, fetch embedding vectors, save chunks and add them to the tree list
99     *
100     * @param ItemList $itemList The list to add the items to
101     * @param string $page Name of the page to split
102     * @param int $chunkID The ID of the first chunk of this page
103     * @return void
104     * @throws \Exception
105     */
106    protected function createPageChunks(ItemList $itemList, $page, $chunkID)
107    {
108        $text = rawWiki($page);
109        $chunks = $this->splitIntoChunks($text);
110        $meta = [
111            'pageid' => $page,
112        ];
113        foreach ($chunks as $chunk) {
114            try {
115                $embedding = $this->openAI->getEmbedding($chunk);
116            } catch (\Exception $e) {
117                if ($this->logger) {
118                    $this->logger->error(
119                        'Failed to get embedding for chunk of page {page}: {msg}',
120                        ['page' => $page, 'msg' => $e->getMessage()]
121                    );
122                }
123                continue;
124            }
125            $item = new Item($chunkID, $embedding);
126            $itemList->addItem($item);
127            $this->saveChunk($item->getId(), $chunk, $embedding, $meta);
128            $chunkID++;
129        }
130        if ($this->logger) {
131            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
132        }
133    }
134
135    /**
136     * Load the existing chunks for the given page and add them to the tree list
137     *
138     * @param ItemList $itemList The list to add the items to
139     * @param string $page Name of the page to split
140     * @param int $chunkID The ID of the first chunk of this page
141     * @return void
142     */
143    protected function reusePageChunks(ItemList $itemList, $page, $chunkID)
144    {
145        for ($i = 0; $i < 100; $i++) {
146            $chunk = $this->loadChunk($chunkID + $i);
147            if (!$chunk) break;
148            $item = new Item($chunkID, $chunk['embedding']);
149            $itemList->addItem($item);
150        }
151        if ($this->logger) {
152            $this->logger->success('{id} reused {count} chunks', ['id' => $page, 'count' => $i]);
153        }
154    }
155
156    /**
157     * Delete all possibly existing chunks for one page (identified by the first chunk ID)
158     *
159     * @param int $chunkID The ID of the first chunk of this page
160     * @return void
161     */
162    protected function deletePageChunks($chunkID)
163    {
164        for ($i = 0; $i < 100; $i++) {
165            $chunk = $this->getChunkFilePath($chunkID + $i);
166            if (!file_exists($chunk)) break;
167            unlink($chunk);
168        }
169    }
170
171    /**
172     * Do a nearest neighbor search for chunks similar to the given question
173     *
174     * Returns only chunks the current user is allowed to read, may return an empty result.
175     *
176     * @param string $query The question
177     * @param int $limit The number of results to return
178     * @return array
179     * @throws \Exception
180     */
181    public function getSimilarChunks($query, $limit = 4)
182    {
183        global $auth;
184        $embedding = $this->openAI->getEmbedding($query);
185
186        $fsTree = $this->getTree();
187        $fsSearcher = new NearestSearch($fsTree);
188        $items = $fsSearcher->search(new Point($embedding), $limit * 2); // we get twice as many as needed
189
190        $result = [];
191        foreach ($items as $item) {
192            $chunk = $this->loadChunk($item->getId());
193            // filter out chunks the user is not allowed to read
194            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
195            $result[] = $chunk;
196            if (count($result) >= $limit) break;
197        }
198        return $result;
199    }
200
201    /**
202     * Access to the KD Tree
203     *
204     * @return FSKDTree
205     */
206    public function getTree()
207    {
208        $file = $this->getStorageDir() . self::INDEX_FILE;
209        return new FSKDTree($file, new ItemFactory());
210    }
211
212    /**
213     * @param $text
214     * @return array
215     * @throws \Exception
216     * @todo maybe add overlap support
217     * @todo support splitting too long sentences
218     */
219    public function splitIntoChunks($text)
220    {
221        $sentenceSplitter = new Sentence();
222        $tiktok = new Encoder();
223
224        $chunks = [];
225        $sentences = $sentenceSplitter->split($text);
226
227        $chunklen = 0;
228        $chunk = '';
229        while ($sentence = array_shift($sentences)) {
230            $slen = count($tiktok->encode($sentence));
231            if ($slen > self::MAX_TOKEN_LEN) {
232                // sentence is too long, we need to split it further
233                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
234                continue;
235            }
236
237            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
238                // add to current chunk
239                $chunk .= $sentence;
240                $chunklen += $slen;
241            } else {
242                // start new chunk
243                $chunks[] = $chunk;
244                $chunk = $sentence;
245                $chunklen = $slen;
246            }
247        }
248        $chunks[] = $chunk;
249
250        return $chunks;
251    }
252
253    /**
254     * Store additional chunk data in the file system
255     *
256     * @param int $id The chunk id in the K-D tree
257     * @param string $text raw text of the chunk
258     * @param float[] $embedding embedding vector of the chunk
259     * @param array $meta meta data to store with the chunk
260     * @return void
261     */
262    public function saveChunk($id, $text, $embedding, $meta = [])
263    {
264        $data = [
265            'id' => $id,
266            'text' => $text,
267            'embedding' => $embedding,
268            'meta' => $meta,
269        ];
270
271        $chunkfile = $this->getChunkFilePath($id);
272        io_saveFile($chunkfile, json_encode($data));
273    }
274
275    /**
276     * Load chunk data from the file system
277     *
278     * @param int $id
279     * @return array|false The chunk data [id, text, embedding, meta => []], false if not found
280     */
281    public function loadChunk($id)
282    {
283        $chunkfile = $this->getChunkFilePath($id);
284        if (!file_exists($chunkfile)) return false;
285        return json_decode(io_readFile($chunkfile, false), true);
286    }
287
288    /**
289     * Return the path to the chunk file
290     *
291     * @param $id
292     * @return string
293     */
294    protected function getChunkFilePath($id)
295    {
296        $id = dechex($id); // use hexadecimal for shorter file names
297        return $this->getStorageDir('chunk') . $id . '.json';
298    }
299
300    /**
301     * Return the path to where the K-D tree and chunk data is stored
302     *
303     * @param string $subdir
304     * @return string
305     */
306    protected function getStorageDir($subdir = '')
307    {
308        global $conf;
309        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
310        if ($subdir) $dir .= $subdir . '/';
311        io_mkdir_p($dir);
312        return $dir;
313    }
314}
315