xref: /plugin/aichat/Embeddings.php (revision 5786be46be4ea7477d2002e973db2f0e45f3db8b)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
58817535bSAndreas Gohruse dokuwiki\Search\Indexer;
6ad38c5fdSAndreas Gohruse Hexogen\KDTree\Exception\ValidationException;
78817535bSAndreas Gohruse Hexogen\KDTree\FSKDTree;
88817535bSAndreas Gohruse Hexogen\KDTree\FSTreePersister;
98817535bSAndreas Gohruse Hexogen\KDTree\Item;
108817535bSAndreas Gohruse Hexogen\KDTree\ItemFactory;
118817535bSAndreas Gohruse Hexogen\KDTree\ItemList;
128817535bSAndreas Gohruse Hexogen\KDTree\KDTree;
138817535bSAndreas Gohruse Hexogen\KDTree\NearestSearch;
148817535bSAndreas Gohruse Hexogen\KDTree\Point;
152ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
168817535bSAndreas Gohruse TikToken\Encoder;
178817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
188817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks of at most MAX_TOKEN_LEN tokens each. For each chunk the embedding vector
 * is fetched from OpenAI and stored in a K-D Tree, chunk data is written to the file system.
 *
 * Chunk IDs are derived from the page ID used by the search index: the chunks of a page occupy the
 * ID range [pageID * MAX_CHUNKS_PER_PAGE, pageID * MAX_CHUNKS_PER_PAGE + MAX_CHUNKS_PER_PAGE - 1].
 */
class Embeddings
{

    const MAX_TOKEN_LEN = 1000;
    const INDEX_NAME = 'aichat';
    const INDEX_FILE = 'index.bin';
    /** Number of consecutive chunk IDs reserved for each page */
    const MAX_CHUNKS_PER_PAGE = 100;

    /** @var OpenAI */
    protected $openAI;
    /** @var CLI|null */
    protected $logger;

    /**
     * @param OpenAI $openAI The client used to fetch embedding vectors
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
    }

    /**
     * Add a logger instance
     *
     * Without a logger all progress and error messages are silently dropped.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Create a new K-D Tree from all pages
     *
     * Deletes the existing index. Chunks that are newer than their page are reused from the file system,
     * all other pages are split and embedded again.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @return void
     * @throws ValidationException
     */
    public function createNewIndex($skipRE = '')
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // NOTE(review): 1536 is the dimension passed to the item list, presumably the length of the
        // embedding vectors returned by OpenAI - confirm against the OpenAI client
        $itemList = new ItemList(1536);
        foreach ($pages as $pid => $page) {
            if (!page_exists($page)) continue;
            if (isHiddenPage($page)) continue;
            if ($skipRE && preg_match($skipRE, $page)) continue;

            // each page owns a fixed range of chunk IDs starting here
            $chunkID = $pid * self::MAX_CHUNKS_PER_PAGE;

            $firstChunk = $this->getChunkFilePath($chunkID);
            // a missing chunk file yields false here, which makes the comparison fail and
            // sends us into the "create new chunks" branch
            if (@filemtime(wikiFN($page)) < @filemtime($firstChunk)) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->reusePageChunks($itemList, $page, $chunkID);
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->deletePageChunks($chunkID);
                $this->createPageChunks($itemList, $page, $chunkID);
            }
        }

        $tree = new KDTree($itemList);
        if ($this->logger) {
            $this->logger->success('Created index with {count} items', ['count' => $tree->getItemCount()]);
        }
        $persister = new FSTreePersister($this->getStorageDir());
        $persister->convert($tree, self::INDEX_FILE);
    }

    /**
     * Split the given page, fetch embedding vectors, save chunks and add them to the tree list
     *
     * Chunks for which no embedding could be fetched are logged and skipped. At most
     * MAX_CHUNKS_PER_PAGE chunks are stored, because any further chunk would use an ID
     * that belongs to the next page.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page to split
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     * @throws \Exception
     */
    protected function createPageChunks(ItemList $itemList, $page, $chunkID)
    {
        $text = rawWiki($page);
        $chunks = $this->splitIntoChunks($text);
        $meta = [
            'pageid' => $page,
        ];
        $stored = 0;
        foreach ($chunks as $chunk) {
            if ($stored >= self::MAX_CHUNKS_PER_PAGE) {
                // never spill into the ID range of the next page
                if ($this->logger) {
                    $this->logger->warning(
                        'Page {page} has more than {max} chunks, remaining chunks are ignored',
                        ['page' => $page, 'max' => self::MAX_CHUNKS_PER_PAGE]
                    );
                }
                break;
            }

            try {
                $embedding = $this->openAI->getEmbedding($chunk);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                // the ID is not advanced, so stored chunk IDs stay contiguous
                continue;
            }
            $item = new Item($chunkID, $embedding);
            $itemList->addItem($item);
            $this->saveChunk($item->getId(), $chunk, $embedding, $meta);
            $chunkID++;
            $stored++;
        }
        if ($this->logger) {
            $this->logger->success('{id} split into {count} chunks', ['id' => $page, 'count' => count($chunks)]);
        }
    }

    /**
     * Load the existing chunks for the given page and add them to the tree list
     *
     * Loading stops at the first chunk that can not be read.
     *
     * @param ItemList $itemList The list to add the items to
     * @param string $page Name of the page the chunks belong to
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function reusePageChunks(ItemList $itemList, $page, $chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->loadChunk($chunkID + $i);
            if (!$chunk) break;
            // the item has to carry the ID of the chunk that was actually loaded
            $item = new Item($chunkID + $i, $chunk['embedding']);
            $itemList->addItem($item);
        }
        if ($this->logger) {
            $this->logger->success('{id} reused {count} chunks', ['id' => $page, 'count' => $i]);
        }
    }

    /**
     * Delete all possibly existing chunks for one page (identified by the first chunk ID)
     *
     * Deletion stops at the first chunk file that does not exist.
     *
     * @param int $chunkID The ID of the first chunk of this page
     * @return void
     */
    protected function deletePageChunks($chunkID)
    {
        for ($i = 0; $i < self::MAX_CHUNKS_PER_PAGE; $i++) {
            $chunk = $this->getChunkFilePath($chunkID + $i);
            if (!file_exists($chunk)) break;
            unlink($chunk);
        }
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * Tree items whose chunk data can not be loaded are skipped.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return array
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $embedding = $this->openAI->getEmbedding($query);

        $fsTree = $this->getTree();
        $fsSearcher = new NearestSearch($fsTree);
        // we get twice as many as needed, because some may be filtered out below
        $items = $fsSearcher->search(new Point($embedding), $limit * 2);

        $result = [];
        foreach ($items as $item) {
            $chunk = $this->loadChunk($item->getId());
            // the index may reference chunks that are missing or unreadable on disk
            if (!$chunk) continue;
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk['meta']['pageid']) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Access to the KD Tree
     *
     * @return FSKDTree
     */
    public function getTree()
    {
        $file = $this->getStorageDir() . self::INDEX_FILE;
        return new FSKDTree($file, new ItemFactory());
    }

    /**
     * Split the given text into chunks of whole sentences
     *
     * Sentences are appended to the current chunk as long as the summed up token count stays below
     * MAX_TOKEN_LEN. Sentences that are longer than MAX_TOKEN_LEN on their own are dropped with a
     * warning. Empty chunks are never returned, so an empty text results in an empty array.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // iterate with foreach: a sentence like "0" is falsy and must not end the loop early
        foreach ($sentences as $sentence) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // start new chunk, the current one may still be empty if the very first
                // sentence already hits the limit
                if ($chunk !== '') $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Store additional chunk data in the file system
     *
     * The data is written as JSON to the file returned by getChunkFilePath().
     *
     * @param int $id The chunk id in the K-D tree
     * @param string $text raw text of the chunk
     * @param float[] $embedding embedding vector of the chunk
     * @param array $meta meta data to store with the chunk
     * @return void
     */
    public function saveChunk($id, $text, $embedding, $meta = [])
    {
        $data = [
            'id' => $id,
            'text' => $text,
            'embedding' => $embedding,
            'meta' => $meta,
        ];

        $chunkfile = $this->getChunkFilePath($id);
        io_saveFile($chunkfile, json_encode($data));
    }

    /**
     * Load chunk data from the file system
     *
     * @param int $id
     * @return array|false The chunk data [id, text, embedding, meta => []], false if not found or unreadable
     */
    public function loadChunk($id)
    {
        $chunkfile = $this->getChunkFilePath($id);
        if (!file_exists($chunkfile)) return false;

        $data = json_decode(io_readFile($chunkfile, false), true);
        // json_decode returns null for broken files, stick to the documented return type
        if (!is_array($data)) return false;
        return $data;
    }

    /**
     * Return the path to the chunk file
     *
     * @param int $id The chunk id
     * @return string
     */
    protected function getChunkFilePath($id)
    {
        $id = dechex($id); // use hexadecimal for shorter file names
        return $this->getStorageDir('chunk') . $id . '.json';
    }

    /**
     * Return the path to where the K-D tree and chunk data is stored
     *
     * The directory is created if it does not exist yet. The returned path ends with a slash.
     *
     * @param string $subdir Optional sub directory below the storage directory
     * @return string
     */
    protected function getStorageDir($subdir = '')
    {
        global $conf;
        $dir = $conf['indexdir'] . '/' . self::INDEX_NAME . '/';
        if ($subdir) $dir .= $subdir . '/';
        io_mkdir_p($dir);
        return $dir;
    }
}
315