xref: /plugin/aichat/Embeddings.php (revision 9da5f0df9b3bbdaf1de18d37258b054d79f1eaa7)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\FSKDTree;
7use Hexogen\KDTree\FSTreePersister;
8use Hexogen\KDTree\Item;
9use Hexogen\KDTree\ItemFactory;
10use Hexogen\KDTree\ItemList;
11use Hexogen\KDTree\KDTree;
12use Hexogen\KDTree\NearestSearch;
13use Hexogen\KDTree\Point;
14use TikToken\Encoder;
15use Vanderlee\Sentence\Sentence;
16
17/**
18 * Manage the embeddings index
19 *
20 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
21 * OpenAI and stored in a K-D Tree, chunk data is written to the file system.
22 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch the embedding vectors */
    protected $openAI;

    /**
     * @var object|null optional logger; only its success($message, $context) method is called.
     * Declared explicitly because dynamic properties are deprecated as of PHP 8.2.
     */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to fetch the embedding vectors
     * @param object|null $logger optional logger providing a success($message, $context) method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the whole index from scratch
     *
     * Removes the existing storage directory, splits every existing page known to the
     * indexer into chunks, fetches one embedding per chunk, stores the chunk data on disk
     * and finally persists the K-D tree built from all embeddings.
     *
     * @return void
     * @throws \Exception when a page contains a sentence longer than MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // 1536 is handed to ItemList as the vector size
        // NOTE(review): presumably the output dimension of the embedding model in use - confirm
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is both the item id in the tree and the chunk file name
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks whose embeddings are nearest to the embedding of the query
     *
     * @param string $query text to search for
     * @param int $limit maximum number of chunks to return
     * @return array[] list of chunk data arrays as returned by loadChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // skip chunks whose data could not be decoded instead of handing null to callers
            if (!is_array($chunk)) continue;
            $result[] = $chunk;
        }
        return $result;
    }

    /**
     * Split the given text into chunks that stay below MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences; sentences are appended to the current chunk as
     * long as the summed token count stays below the limit. Chunks that contain only
     * whitespace are never returned, so an empty text results in an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence is longer than MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift()): a falsy sentence such as "0" must not end the loop
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, but never emit a blank one
                if (trim($chunk) !== '') {
                    $chunks[] = $chunk;
                }
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the remainder, again skipping blank chunks (e.g. for empty pages)
        if (trim($chunk) !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to "<id>.json" inside the "chunk" storage sub directory.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null when the file content is not valid JSON
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path always ends
     * with a slash.
     *
     * @param string $subdir optional sub directory below the index directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
177