xref: /plugin/aichat/Embeddings.php (revision ad38c5fd62a65d04772bdd994d54a93483f88639)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\Exception\ValidationException;
7use Hexogen\KDTree\FSKDTree;
8use Hexogen\KDTree\FSTreePersister;
9use Hexogen\KDTree\Item;
10use Hexogen\KDTree\ItemFactory;
11use Hexogen\KDTree\ItemList;
12use Hexogen\KDTree\KDTree;
13use Hexogen\KDTree\NearestSearch;
14use Hexogen\KDTree\Point;
15use splitbrain\phpcli\CLI;
16use TikToken\Encoder;
17use Vanderlee\Sentence\Sentence;
18
19/**
20 * Manage the embeddings index
21 *
22 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
23 * OpenAI and stored in a K-D Tree, chunk data is written to the file system.
24 */
class Embeddings
{

    /** Upper bound for the number of tokens in a single chunk */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the sub directory (below the index directory) that holds all data of this index */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D tree within the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch the embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger, progress and errors are only reported when set */
    protected $logger;

    /**
     * @param OpenAI $openAI
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * When set, it receives the success, warning and error messages emitted while indexing.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index. Pages that do not exist, are hidden or match $skipRE are ignored.
     * A chunk whose embedding can not be fetched is logged and skipped; indexing continues with the next chunk.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // NOTE(review): 1536 is presumably the dimensionality of the vectors returned by
        // OpenAI::getEmbedding() - confirm when switching the embedding model
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
            ];
            foreach ($chunks as $chunk) {
                try {
                    $embedding = $this->openAI->getEmbedding($chunk);
                } catch (\Exception $e) {
                    if ($this->logger) {
                        $this->logger->error(
                            'Failed to get embedding for chunk of page {page}: {msg}',
                            ['page' => $page, 'msg' => $e->getMessage()]
                        );
                    }
                    continue;
                }
                // the running counter is the item id in the tree and the name of the chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Index entries whose chunk data can not be loaded from the file system are ignored.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed, because some may be dropped by the filters below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // a missing or corrupt chunk file must not break the search or leak a null result
            if (!is_array($chunk) || !isset($chunk['meta']['pageid'])) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of less than MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences and consecutive sentences are joined until the token limit
     * would be reached. Sentences that exceed the limit on their own are skipped (with a warning).
     * Blank chunks are never returned, so an empty text results in an empty array.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a sentence like '0' is falsy and must not terminate the loop early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty if nothing fit into it yet
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the remainder, but never emit a blank chunk
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to a file named after the chunk id.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null if it could not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path ends with a slash.
     *
     * @param string $subdir optional sub directory within the storage directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
231