xref: /plugin/aichat/Embeddings.php (revision 6f9744f7496f902f519f76ccd13e8a26b5dea6c9)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\FSKDTree;
7use Hexogen\KDTree\FSTreePersister;
8use Hexogen\KDTree\Item;
9use Hexogen\KDTree\ItemFactory;
10use Hexogen\KDTree\ItemList;
11use Hexogen\KDTree\KDTree;
12use Hexogen\KDTree\NearestSearch;
13use Hexogen\KDTree\Point;
14use splitbrain\phpcli\CLI;
15use TikToken\Encoder;
16use Vanderlee\Sentence\Sentence;
17
18/**
19 * Manage the embeddings index
20 *
21 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
22 * OpenAI and stored in a K-D Tree, chunk data is written to the file system.
23 */
class Embeddings
{

    /** Upper bound of tokens a single chunk may contain */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the sub directory (below the index directory) holding all data of this index */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D tree */
    const INDEX_FILE = 'index.bin';
    /**
     * Number of dimensions the K-D tree item list is created with
     *
     * NOTE(review): presumably this has to match the length of the vectors returned by
     * OpenAI::getEmbedding() - confirm when switching the embedding model.
     */
    const EMBEDDING_DIMENSIONS = 1536;

    /** @var OpenAI client used to fetch the embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger, progress is only reported when it is set */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to fetch the embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * When set, index creation reports its progress through it.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index. Every existing, non-hidden page known to the indexer is split
     * into chunks; for each chunk the embedding is fetched, added to the tree and the chunk data
     * is written to the file system under the same id.
     *
     * @return void
     * @throws \Hexogen\KDTree\Exception\ValidationException
     * @throws \Exception when a page contains a sentence that is too long to fit into one chunk
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0; // running counter, doubles as the chunk id

        $itemList = new ItemList(self::EMBEDDING_DIMENSIONS);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                // the chunk file is stored under the tree item's id so it can be found again on search
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks whose data file is missing or unreadable are skipped.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array list of chunk data arrays as returned by loadChunk()
     * @throws \Exception when no index has been created yet
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;

        $file = $this->getStorageDir() . self::INDEX_FILE;
        if (!file_exists($file)) {
            // fail early and clearly, before spending an API call on the query embedding
            throw new \Exception('No embeddings index found, it needs to be created first');
        }

        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed, because some may be dropped by the filters below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // skip chunks whose data file is missing or corrupt, we can neither check nor show them
            if (!is_array($chunk) || !isset($chunk['meta']['pageid'])) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of less than MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences first, then sentences are appended to the current chunk
     * until adding the next one would reach the token limit. Chunks that contain nothing but
     * white space are dropped, so an empty text results in an empty list.
     *
     * @param string $text
     * @return string[] the chunks in order of appearance
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a loop on the truthiness of the sentence would stop early on "0" or ""
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty if the very first sentence hits the limit
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // keep the remainder, unless there is nothing in it
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to "<id>.json" in the "chunk" sub directory of the storage dir.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * Reads the JSON file written by saveChunk() for the given id.
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null when the file content can not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if needed. The returned path always ends with a slash.
     *
     * @param string $subdir optional sub directory below the storage dir
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
216}
217