xref: /plugin/aichat/Embeddings.php (revision ad38c5fd62a65d04772bdd994d54a93483f88639)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
58817535bSAndreas Gohruse dokuwiki\Search\Indexer;
6*ad38c5fdSAndreas Gohruse Hexogen\KDTree\Exception\ValidationException;
78817535bSAndreas Gohruse Hexogen\KDTree\FSKDTree;
88817535bSAndreas Gohruse Hexogen\KDTree\FSTreePersister;
98817535bSAndreas Gohruse Hexogen\KDTree\Item;
108817535bSAndreas Gohruse Hexogen\KDTree\ItemFactory;
118817535bSAndreas Gohruse Hexogen\KDTree\ItemList;
128817535bSAndreas Gohruse Hexogen\KDTree\KDTree;
138817535bSAndreas Gohruse Hexogen\KDTree\NearestSearch;
148817535bSAndreas Gohruse Hexogen\KDTree\Point;
152ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
168817535bSAndreas Gohruse TikToken\Encoder;
178817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
188817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of up to MAX_TOKEN_LEN tokens (counted per sentence). For each chunk the
 * embedding vector is fetched from OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 */
class Embeddings
{

    /** Upper bound for the summed per-sentence token count of a single chunk */
    const MAX_TOKEN_LEN = 1000;
    /** Name of the directory (below the wiki's index directory) that holds all data of this class */
    const INDEX_NAME = 'aichat';
    /** File name of the persisted K-D Tree inside the storage directory */
    const INDEX_FILE = 'index.bin';

    /** @var OpenAI client used to fetch embedding vectors */
    protected $openAI;
    /** @var CLI|null optional logger, all logging is skipped when unset */
    protected $logger;

    /**
     * @param OpenAI $openAI
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index. Pages that do not exist, are hidden or match $skipRE are ignored.
     * Chunks for which no embedding could be fetched are logged and left out of the index.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        io_rmdir($this->getStorageDir(), true); // delete old index

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $itemCount = 0; // running counter, doubles as the chunk id

        // NOTE(review): 1536 is presumably the dimension of the embedding vectors returned by OpenAI - confirm
        $itemList = new ItemList(1536);
        foreach ($pages as $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;
            $text = rawWiki($page);
            $chunks = $this->splitIntoChunks($text);
            $meta = [
                'pageid' => $page,
            ];
            foreach ($chunks as $chunk) {
                try {
                    $embedding = $this->openAI->getEmbedding($chunk);
                } catch (\Exception $e) {
                    // a single failing chunk must not abort the whole index run
                    if ($this->logger) {
                        $this->logger->error(
                            'Failed to get embedding for chunk of page {page}: {msg}',
                            ['page' => $page, 'msg' => $e->getMessage()]
                        );
                    }
                    continue;
                }
                $item = new Item($itemCount++, $embedding);
                $itemList->addItem($item);
                $this->saveChunk($item->getId(), $chunk, $meta);
            }
            if ($this->logger) {
                $this->logger->success('Split {id} into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Index entries whose chunk data can not be loaded are skipped.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array List of chunk data arrays as returned by loadChunk()
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $file = $this->getStorageDir() . self::INDEX_FILE;
        $fsTree = new FSKDTree($file, new ItemFactory());
        $fsSearcher = new NearestSearch($fsTree);
        // fetch twice as many as needed, so there are spare candidates when some get filtered out below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // chunk file is missing or does not contain valid JSON
            if (!is_array($chunk)) continue;

            if ($auth) {
                // without a page id the permission can not be verified, so the chunk is not exposed
                if (!isset($chunk['meta']['pageid'])) continue;
                // filter out chunks the user is not allowed to read
                if (auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            }

            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks
     *
     * The text is split into sentences. Sentences are appended to the current chunk as long as the summed
     * token count stays below MAX_TOKEN_LEN, otherwise a new chunk is started. Sentences that are longer
     * than MAX_TOKEN_LEN on their own are dropped with a warning. Blank chunks are never returned, so the
     * result is empty when the text yields no usable sentence.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // foreach instead of a truthiness based loop: a sentence like "0" must not end the processing early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the finished one is only kept when it has content
                if (trim($chunk) !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // keep the remainder, unless nothing was collected at all
        if (trim($chunk) !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to "<id>.json" in the "chunk" sub directory of the storage directory.
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'meta' => $meta,
        ];

        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|null The chunk data [id, text, meta => []], null when the stored data can not be decoded
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getStorageDir('chunk') . $id . '.json';
        return json_decode(io_readFile($chunkfile, false), true);
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet.
     *
     * @param string $subdir optional sub directory below the storage directory
     * @return string the directory path, always with a trailing slash
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
231