xref: /plugin/aichat/Embeddings.php (revision 8817535b0c67f8b10e9b8c05dcdf58fc17827423)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\FSKDTree;
7use Hexogen\KDTree\FSTreePersister;
8use Hexogen\KDTree\Item;
9use Hexogen\KDTree\ItemFactory;
10use Hexogen\KDTree\ItemList;
11use Hexogen\KDTree\KDTree;
12use Hexogen\KDTree\NearestSearch;
13use Hexogen\KDTree\Point;
14use TikToken\Encoder;
15use Vanderlee\Sentence\Sentence;
16
/**
 * Builds and queries the embedding index of the aichat plugin.
 *
 * Wiki pages are split into chunks of text. Each chunk is converted into an embedding vector
 * through the injected OpenAI client and added to a KD tree that is persisted on disk. The chunk
 * texts and their meta data are stored as individual JSON files next to the tree.
 */
class Embeddings
{

    /** Upper limit of tokens (as counted by the TikToken encoder) per chunk */
    const MAX_TOKEN_LEN = 1500;
    /** Name of the storage directory below the configured index directory */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted KD tree within the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to create embeddings */
    protected $openAI;

    /** @var object|null optional logger, only its success() method is used here */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to create embeddings
     * @param object|null $logger optional logger providing a success($message, $context) method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the index from scratch
     *
     * Removes the existing storage directory, then walks all pages known to the search indexer,
     * splits each existing page into chunks, fetches an embedding per chunk and finally persists
     * the resulting KD tree.
     *
     * @return void
     * @throws \Exception when a page contains a sentence that is too long to fit into a chunk
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // 1536 is presumably the dimensionality of the vectors returned by the OpenAI client -- TODO confirm
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // item IDs are consecutive numbers, the same ID names the chunk's JSON file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks that are most similar to the given query
     *
     * @param string $query the text to find similar chunks for
     * @param int $limit maximum number of chunks to return
     * @return array list of chunk data as returned by loadChunk()
     * @throws \RuntimeException when no index has been created yet
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        // check for the index first, so no embedding is requested when it can not be used anyway
        $file = $this->getStorageDir() . self::INDEX_FILE;
        if (!file_exists($file)) {
            throw new \RuntimeException('No embedding index found, it needs to be created first');
        }

        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of less than MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences first. Sentences are then collected into a chunk until
     * adding the next sentence would reach the token limit. Chunks that contain nothing but
     * white space are not returned, so an empty text results in an empty list.
     *
     * @param string $text
     * @return string[] the chunks in order of appearance
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a falsy sentence like "0" must not end the loop early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the finished one is only kept when it has content
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // keep the remainder, unless there is none (eg. for empty pages)
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store a chunk and its meta data as JSON file in the chunk directory
     *
     * @param int|string $id the ID of the chunk, used as file name
     * @param string $text the text of the chunk
     * @param array $meta additional data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load the chunk stored under the given ID
     *
     * @param int|string $id the ID the chunk was saved with
     * @return array|null the data in the structure written by saveChunk(),
     *                    null when the file content can not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Get the path of the storage directory and make sure it exists
     *
     * @param string $subdir optional sub directory within the storage directory
     * @return string the directory path with a trailing slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
154