xref: /plugin/aichat/Embeddings.php (revision 883057195a47ebf6c0d68d209e87735466d25f89)
1<?php
2
3namespace dokuwiki\plugin\aichat;
4
5use dokuwiki\plugin\aichat\backend\AbstractStorage;
6use dokuwiki\plugin\aichat\backend\Chunk;
7use dokuwiki\plugin\aichat\backend\KDTreeStorage;
8use dokuwiki\plugin\aichat\backend\SQLiteStorage;
9use dokuwiki\Search\Indexer;
10use Hexogen\KDTree\Exception\ValidationException;
11use splitbrain\phpcli\CLI;
12use TikToken\Encoder;
13use Vanderlee\Sentence\Sentence;
14
15/**
16 * Manage the embeddings index
17 *
18 * Pages are split into chunks of 1000 tokens each. For each chunk the embedding vector is fetched from
19 * OpenAI and stored in the Storage backend.
20 */
class Embeddings
{
    /** Maximum number of tokens allowed in a single chunk */
    const MAX_TOKEN_LEN = 1000;

    /** @var OpenAI the client used to fetch embedding vectors */
    protected $openAI;

    /** @var CLI|null optional logger for progress and error reporting */
    protected $logger;

    /** @var AbstractStorage backend holding the chunk/vector index */
    protected $storage;

    /**
     * @param OpenAI $openAI
     */
    public function __construct(OpenAI $openAI)
    {
        $this->openAI = $openAI;
        $this->storage = new SQLiteStorage(); // FIXME make configurable (e.g. KDTreeStorage)
    }

    /**
     * Access the configured storage backend
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * When set, progress, warnings and errors are reported through it.
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Update the embeddings storage
     *
     * Iterates over all indexed pages. Pages that no longer qualify (missing, hidden,
     * too small, or matching $skipRE) are removed from the index. Unchanged pages have
     * their existing chunks reused; changed pages are re-chunked and re-embedded.
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        // 1536 is the vector dimension expected by the storage backends here
        // NOTE(review): presumably matches the OpenAI embedding model in use — confirm
        $this->storage->startCreation(1536, $clear);
        foreach ($pages as $pid => $page) {
            $chunkID = $pid * 100; // chunk IDs start at page ID * 100

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, $page))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * Will use the text renderer plugin if available to get the rendered text.
     * Otherwise the raw wiki text is used. Chunks whose embedding request fails
     * are skipped (logged as errors) rather than aborting the whole page.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @throws \Exception
     */
    protected function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            global $ID;
            $ID = $page;
            $text = p_cached_output(wikiFN($page), 'text', $page);
        } else {
            $text = rawWiki($page);
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            try {
                $embedding = $this->openAI->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger) {
            // report chunks actually created — failed embeddings above are skipped,
            // so count($parts) could over-report
            $this->logger->success(
                '{id} split into {count} chunks',
                ['id' => $page, 'count' => count($chunkList)]
            );
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     *
     * @param string $query The question
     * @param int $limit The number of results to return
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $limit = 4)
    {
        global $auth;
        $vector = $this->openAI->getEmbedding($query);

        // over-fetch: the ACL filter below may drop results, and we still want
        // up to $limit readable chunks
        $chunks = $this->storage->getSimilarChunks($vector, $limit * 2);
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            $result[] = $chunk;
            if (count($result) >= $limit) break;
        }
        return $result;
    }

    /**
     * Split the given text into chunks of at most MAX_TOKEN_LEN tokens
     *
     * Sentences are kept intact; a sentence longer than MAX_TOKEN_LEN tokens is
     * currently dropped with a warning.
     *
     * @param string $text
     * @return string[] the chunk texts
     * @throws \Exception
     * @todo maybe add overlap support
     * @todo support splitting too long sentences
     */
    public function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = new Encoder();

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly: array_shift() returns NULL only on an
        // empty array, while a falsy sentence like "0" or '' must not end the loop
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > self::MAX_TOKEN_LEN) {
                // sentence is too long, we need to split it further
                if ($this->logger) $this->logger->warning('Sentence too long, splitting not implemented yet');
                continue;
            }

            if ($chunklen + $slen < self::MAX_TOKEN_LEN) {
                // still fits, add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // current chunk is full, start a new one
                $chunks[] = $chunk;
                $chunk = $sentence;
                $chunklen = $slen;
            }
        }
        // don't emit an empty trailing chunk (e.g. for empty input text)
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }
}
222