xref: /plugin/aichat/Embeddings.php (revision 9e81bea79e153f16a1470ded51bcd0395dfad8cc)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\FSKDTree;
7use Hexogen\KDTree\FSTreePersister;
8use Hexogen\KDTree\Item;
9use Hexogen\KDTree\ItemFactory;
10use Hexogen\KDTree\ItemList;
11use Hexogen\KDTree\KDTree;
12use Hexogen\KDTree\NearestSearch;
13use Hexogen\KDTree\Point;
14use TikToken\Encoder;
15use Vanderlee\Sentence\Sentence;
16
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most MAX_TOKEN_LEN tokens each. For each chunk the embedding vector is
 * fetched from OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';
    /** Number of dimensions of the embedding vectors stored in the K-D tree */
    const EMBEDDING_DIMENSIONS = 1536;

    /** @var OpenAI client used to fetch the embedding vectors */
    protected $openAI;

    /** @var object|null optional logger; only its success($message, $context) method is used here */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to fetch the embedding vectors
     * @param object|null $logger optional logger, progress is reported via its success() method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the complete index from scratch
     *
     * Deletes the existing storage directory, then splits every existing page known to the indexer
     * into chunks, fetches one embedding per chunk and persists the resulting K-D tree.
     *
     * @return void
     * @throws \Exception when a page contains a sentence longer than MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        $itemList = new ItemList(self::EMBEDDING_DIMENSIONS);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            // empty pages yield no chunks, so no embedding is requested for them
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is the item id and also names the chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Chunks whose data file is missing or unreadable are skipped.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed, because some may be filtered out below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // skip chunks whose data could not be loaded (missing or corrupt chunk file)
            if (!is_array($chunk) || !isset($chunk['meta']['pageid'])) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to the current chunk as long as the summed token count stays below
     * MAX_TOKEN_LEN. Chunks that contain nothing but whitespace are never returned, so empty text
     * results in an empty array.
     *
     * @param string $text
     * @return string[] list of non-empty chunks
     * @throws \Exception when a single sentence is longer than MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a falsy sentence such as "0" must not end the loop early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the finished one is only kept when it has content
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // keep the trailing chunk, unless there was no text at all
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to <id>.json in the 'chunk' sub directory of the storage directory.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null when the stored data can not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path always ends with a slash.
     *
     * @param string $subdir optional sub directory below the index directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
192