xref: /plugin/aichat/Embeddings.php (revision c4584168c6c9af22a69973c17af0a7ff5f7fb802)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\Search\Indexer;
6use Hexogen\KDTree\FSKDTree;
7use Hexogen\KDTree\FSTreePersister;
8use Hexogen\KDTree\Item;
9use Hexogen\KDTree\ItemFactory;
10use Hexogen\KDTree\ItemList;
11use Hexogen\KDTree\KDTree;
12use Hexogen\KDTree\NearestSearch;
13use Hexogen\KDTree\Point;
14use TikToken\Encoder;
15use Vanderlee\Sentence\Sentence;
16
/**
 * Builds and queries the embeddings index of the aichat plugin
 *
 * Wiki pages are split into chunks of limited token length. For every chunk an embedding
 * vector is requested from the OpenAI client, the vectors are stored in a KD-tree that is
 * persisted on disk, and the chunk texts are stored as JSON files next to the tree.
 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';
    /** Value handed to the ItemList constructor (presumably the vector dimension count) */
    const EMBEDDING_DIMENSIONS = 1536;

    /** @var OpenAI client used to request embeddings */
    protected $openAI;

    /** @var object|null optional logger, needs to provide a success($message, $context) method */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to request embeddings
     * @param object|null $logger optional logger used to report progress while indexing
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Throw away any existing index and build a new one from all pages known to the indexer
     *
     * @return void
     * @throws \Exception when a page contains a sentence longer than MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        $itemList = new ItemList(self::EMBEDDING_DIMENSIONS);
        foreach ($pages as $page) {
            // the indexer may still list pages that have been deleted in the meantime
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is the item id and links tree items to their chunk files
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks whose embeddings are nearest to the embedding of the given query
     *
     * @param string $query the text to find similar chunks for
     * @param int $limit maximum number of chunks to return
     * @return array list of chunk data as returned by loadChunk()
     * @throws \RuntimeException when no index file exists yet
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        // check for the index first, so no embedding is requested when there is nothing to search
        $file = $this->getStorageDir() . self::INDEX_FILE;
        if (!file_exists($file)) {
            throw new \RuntimeException('No embeddings index found, create one first');
        }

        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks that stay within MAX_TOKEN_LEN tokens
     *
     * The text is split into sentences, then sentences are concatenated until adding the next
     * one would reach the limit. Chunks without any content are not returned, so an empty
     * text results in an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence is longer than MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $chunklen = 0;
        $chunk = '';

        // iterate with foreach: looping on the truthiness of array_shift() would end at the
        // first sentence that is '' or '0' and silently drop the rest of the text
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, but never emit a chunk that has no content
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // remainder, skipped when the text was empty or whitespace only
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store a chunk as JSON file named after its id in the chunk storage directory
     *
     * @param int|string $id the id of the chunk, same as the id of its item in the index
     * @param string $text the chunk's text
     * @param array $meta additional data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load the data that saveChunk() stored for the given id
     *
     * @param int|string $id the id of the chunk
     * @return array|null the decoded data (id, text, meta), null when the file content can not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Get the directory in which the index data is stored and make sure it exists
     *
     * @param string $subdir optional sub directory below the index's own directory
     * @return string the directory path, always ending in a slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
153