xref: /plugin/aichat/Embeddings.php (revision 8817535b0c67f8b10e9b8c05dcdf58fc17827423)
1*8817535bSAndreas Gohr<?php
2*8817535bSAndreas Gohr
3*8817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
4*8817535bSAndreas Gohr
5*8817535bSAndreas Gohruse dokuwiki\Search\Indexer;
6*8817535bSAndreas Gohruse Hexogen\KDTree\FSKDTree;
7*8817535bSAndreas Gohruse Hexogen\KDTree\FSTreePersister;
8*8817535bSAndreas Gohruse Hexogen\KDTree\Item;
9*8817535bSAndreas Gohruse Hexogen\KDTree\ItemFactory;
10*8817535bSAndreas Gohruse Hexogen\KDTree\ItemList;
11*8817535bSAndreas Gohruse Hexogen\KDTree\KDTree;
12*8817535bSAndreas Gohruse Hexogen\KDTree\NearestSearch;
13*8817535bSAndreas Gohruse Hexogen\KDTree\Point;
14*8817535bSAndreas Gohruse TikToken\Encoder;
15*8817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
16*8817535bSAndreas Gohr
/**
 * Creates and queries the embedding index used to find wiki text that is
 * similar to a given query.
 *
 * Page text is split into chunks, an embedding is requested for each chunk
 * from the OpenAI client, and the embeddings are stored in a KD-tree on disk.
 * The chunk text itself is stored next to the tree as one JSON file per chunk.
 */
class Embeddings
{

    /** Token limit applied to single sentences and to the chunks built from them */
    const MAX_TOKEN_LEN = 1500;
    /** Name of the directory (below the wiki's index directory) that holds all data of this class */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted KD-tree inside the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to request embeddings */
    protected $openAI;

    /** @var object|null optional logger; only its success($message, $context) method is used */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to request embeddings
     * @param object|null $logger optional logger providing a success($message, $context) method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Rebuild the whole index from scratch
     *
     * Deletes the existing storage directory, then walks all pages known to the
     * search indexer, splits each existing page into chunks, requests an
     * embedding per chunk and finally persists the resulting KD-tree.
     *
     * @return void
     * @throws \Exception when a page contains a sentence longer than MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        // NOTE(review): 1536 is presumably the dimension count of the embedding vectors - confirm
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter is the item ID and also names the chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks that are closest to the given query
     *
     * Requests an embedding for the query and runs a nearest neighbour search
     * on the persisted KD-tree.
     *
     * @param string $query the text to find similar chunks for
     * @param int $limit maximum number of results passed to the nearest neighbour search
     * @return array list of chunk data as returned by loadChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to the current chunk as long as the summed up
     * token count stays below MAX_TOKEN_LEN, otherwise a new chunk is started.
     * Chunks without any non-whitespace content are never returned, so an empty
     * text results in an empty list.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence has more than MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $chunklen = 0;
        $chunk = '';

        // foreach instead of while(array_shift()): a falsy sentence such as '0'
        // must not end the loop and drop the remaining text
        foreach ($sentenceSplitter->split($text) as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; the current one may still be empty when the
                // very first sentence already fills a whole chunk
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // flush the remainder, but never emit an empty chunk
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store a chunk as JSON file in the 'chunk' sub directory of the storage
     *
     * @param int|string $id the chunk's ID, used as base name of the file
     * @param string $text the chunk's text
     * @param array $meta additional data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load the data of a chunk previously stored by saveChunk()
     *
     * @param int|string $id the chunk's ID
     * @return array|null the decoded data ('id', 'text', 'meta'), null when the file content is not valid JSON
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Get the directory all data of this class is stored in
     *
     * The directory is created if it does not exist yet.
     *
     * @param string $subdir optional sub directory below the storage directory
     * @return string the directory path, always with a trailing slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
154