xref: /plugin/aichat/Embeddings.php (revision c4584168c6c9af22a69973c17af0a7ff5f7fb802)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
58817535bSAndreas Gohruse dokuwiki\Search\Indexer;
68817535bSAndreas Gohruse Hexogen\KDTree\FSKDTree;
78817535bSAndreas Gohruse Hexogen\KDTree\FSTreePersister;
88817535bSAndreas Gohruse Hexogen\KDTree\Item;
98817535bSAndreas Gohruse Hexogen\KDTree\ItemFactory;
108817535bSAndreas Gohruse Hexogen\KDTree\ItemList;
118817535bSAndreas Gohruse Hexogen\KDTree\KDTree;
128817535bSAndreas Gohruse Hexogen\KDTree\NearestSearch;
138817535bSAndreas Gohruse Hexogen\KDTree\Point;
148817535bSAndreas Gohruse TikToken\Encoder;
158817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
168817535bSAndreas Gohr
/**
 * Builds and queries a nearest-neighbour index of page chunk embeddings
 *
 * Pages are split into chunks, each chunk is turned into an embedding vector via the
 * OpenAI client and stored in a KD tree that is persisted below the wiki's index directory.
 * The chunk texts themselves are stored as individual JSON files next to the tree.
 */
class Embeddings
{

    /** Upper bound of tokens per chunk (and per single sentence) */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory below $conf['indexdir'] holding all data of this class */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted KD tree inside the storage directory */
    const INDEX_FILE = 'index.bin';
    /**
     * Number of dimensions the item list of the KD tree is created with
     *
     * NOTE(review): presumably has to match the length of the vectors returned by
     * OpenAI::getEmbedding() - confirm against the embedding model in use
     */
    const EMBEDDING_DIMENSIONS = 1536;

    /** @var OpenAI client used to create the embeddings */
    protected $openAI;

    /**
     * @var object|null optional logger, only its success($message, $context) method is used
     *
     * Declared explicitly: assigning an undeclared property creates a dynamic property,
     * which is deprecated as of PHP 8.2
     */
    protected $logger;

    /**
     * @param OpenAI $openAI client used to create the embeddings
     * @param object|null $logger optional logger providing a success($message, $context) method
     */
    public function __construct(OpenAI $openAI, $logger = null)
    {
        $this->openAI = $openAI;
        $this->logger = $logger;
    }

    /**
     * Drop any existing index and rebuild it from all pages known to the search indexer
     *
     * For each existing page the raw wiki text is split into chunks, an embedding is
     * requested per chunk, and the chunk is saved under a running numeric id. Finally
     * the KD tree built from all embeddings is persisted to disk.
     *
     * @return void
     * @throws \Exception when a page contains a sentence longer than MAX_TOKEN_LEN tokens
     */
    public function createNewIndex()
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0;

        $itemList = new ItemList(self::EMBEDDING_DIMENSIONS);
        foreach ($pages as $page) {
            // the indexer may still list pages that have been deleted meanwhile
            if (!page_exists($page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
                // fixme add title here?
            ];
            foreach ($chunks as $chunk) {
                $embedding = $this->openAI->getEmbedding($chunk);
                // the running counter links the tree item to its chunk file
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Find the stored chunks whose embeddings are nearest to the embedding of the query
     *
     * @param string $query text to search similar chunks for
     * @param int $limit maximum number of chunks to return
     * @return array list of chunk data as returned by loadChunk()
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        $items = $fsSearcher->search(new Point($embedding), $limit);

        $result = [];
        foreach ($items as $item) {
            $result[] = $this->loadChunk($item->getId());
        }
        return $result;
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to the current chunk as long as the summed up token count
     * stays below MAX_TOKEN_LEN, otherwise a new chunk is started. Chunks without any
     * non-whitespace content are not returned, so an empty text yields an empty array.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception when a single sentence exceeds MAX_TOKEN_LEN tokens
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of while(array_shift()): a falsy sentence like "0" must not end the loop
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                throw new \Exception('Sentence too long, splitting not implemented yet');
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk; the current one is still empty when the very first
                // sentence already fills a whole chunk on its own
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // keep the remainder, but never hand out an empty chunk
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store a chunk as JSON file named after its id
     *
     * @param int|string $id identifier of the chunk, used as file name
     * @param string $text the chunk's text
     * @param array $meta additional data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load a chunk previously stored with saveChunk()
     *
     * @param int|string $id identifier of the chunk
     * @return array|null the decoded data ('id', 'text', 'meta'); null when the file's
     *                    content can not be decoded as JSON
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Get the directory all data is stored in, creating it when missing
     *
     * @param string $subdir optional sub directory below the storage directory
     * @return string the directory path, always ending in a slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
153