xref: /plugin/aichat/Storage/ChromaStorage.php (revision 5e6dd16e3c119de7bfaea05e364c6f864ff9ca03)
1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication goes through the Chroma HTTP API (v1) below the configured
 * base URL. Chunk IDs are sent to Chroma as strings and cast back to integers
 * when chunks are read.
 */
class ChromaStorage extends AbstractStorage
{
    /**
     * Size of the ID window that is inspected per page. The page level
     * operations address the IDs $firstChunkID .. $firstChunkID + 99.
     */
    protected const MAX_CHUNKS_PER_PAGE = 100;

    /** @var string URL to the chroma server instance */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string tenant sent with every request */
    protected $tenant = 'default_tenant';
    /** @var string database sent with every request */
    protected $database = 'default_database';
    /** @var string name of the collection holding the chunks */
    protected $collection = '';
    /** @var string cached ID of the collection, empty until looked up */
    protected $collectionID = '';

    /**
     * ChromaStorage constructor.
     *
     * Reads the connection settings from the aichat helper plugin and prepares
     * the HTTP client used for all API calls.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // a trailing slash would result in a double slash in front of /api/v1
        $this->baseurl = rtrim((string)$helper->getConf('chroma_baseurl'), '/');
        $this->collection = $helper->getConf('chroma_collection');

        // keep the built-in defaults when nothing is configured
        $tenant = (string)$helper->getConf('chroma_tenant');
        if ($tenant !== '') {
            $this->tenant = $tenant;
        }
        $database = (string)$helper->getConf('chroma_database');
        if ($database !== '') {
            $this->database = $database;
        }

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        $apikey = $helper->getConf('chroma_apikey');
        if ($apikey) {
            $this->http->headers['Authorization'] = 'Bearer ' . $apikey;
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded. Pass '' or null to send no body
     * @param string $method POST|GET|DELETE
     * @return mixed The decoded JSON response
     * @throws \Exception when the payload can not be encoded, the server does not answer,
     *                    answers with invalid JSON or with a status other than 200
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . '/api/v1' . $endpoint .
            '?tenant=' . rawurlencode($this->tenant) .
            '&database=' . rawurlencode($this->database);

        if ($data === '' || $data === null) {
            // body-less request, do not send a JSON encoded empty string
            $json = '';
        } elseif ($data === []) {
            // an empty PHP array would be encoded as [], but an object is wanted
            $json = '{}';
        } else {
            // throw instead of silently sending an empty body, e.g. on invalid UTF-8
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;

        // compare against the empty string: a body of "0" is a valid answer (e.g. a count)
        if ((string)$response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode($response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\Exception $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        if ((int)$this->http->status !== 200) {
            // the error message may be located at different places in the response
            if (isset($result['detail'][0]['msg'])) {
                $error = $result['detail'][0]['msg'];
            } elseif (isset($result['detail']['msg'])) {
                $error = $result['detail']['msg'];
            } elseif (isset($result['detail']) && is_string($result['detail'])) {
                $error = $result['detail'];
            } elseif (isset($result['error'])) {
                $error = $result['error'];
            } else {
                $error = $this->http->error;
            }

            // make sure structured error payloads can be appended to the message
            if (!is_string($error)) {
                $error = json_encode($error);
            }

            throw new \Exception('Chroma API returned error. ' . $error);
        }

        return $result;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is requested with get_or_create, the returned ID is cached
     * in this object.
     *
     * @return string
     * @throws \Exception
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) return $this->collectionID;

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true
            ]
        );

        if (empty($result['id'])) {
            throw new \Exception('Chroma API returned no collection ID.');
        }

        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Build the list of chunk IDs that are addressed for a page
     *
     * @param int $firstChunkID The first chunk ID of the page
     * @return string[] The IDs $firstChunkID .. $firstChunkID + 99 as strings
     */
    protected function getPageChunkIDs($firstChunkID)
    {
        return array_map(
            'strval',
            range($firstChunkID, $firstChunkID + self::MAX_CHUNKS_PER_PAGE - 1, 1)
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ]
            ]
        );

        // nothing found for this ID
        if (empty($data['ids'])) return null;

        return new Chunk(
            $data['metadatas'][0]['page'],
            (int)$data['ids'][0],
            $data['documents'][0],
            $data['embeddings'][0],
            $data['metadatas'][0]['language'] ?? '',
            $data['metadatas'][0]['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the collection is deleted by name; it is requested again by the next getCollectionID() call
            $this->runQuery('/collections/' . rawurlencode($this->collection), '', 'DELETE');
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID)
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        // the API expects parallel lists, one entry per chunk in each of them
        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage()
            ];
            $documents[] = $chunk->getText();
        }

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID),
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ],
                'limit' => self::MAX_CHUNKS_PER_PAGE,
            ]
        );

        // always return a list, also when the page has no chunks
        if (empty($data['ids'])) return [];

        $chunks = [];
        foreach ($data['ids'] as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][$idx]['page'],
                (int)$id,
                $data['documents'][$idx],
                $data['embeddings'][$idx],
                $data['metadatas'][$idx]['language'] ?? '',
                $data['metadatas'][$idx]['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        if ($lang) {
            // the operator has to be the key, so that it is encoded as {"$eq": "<lang>"}
            $filter = ['language' => ['$eq' => $lang]];
        } else {
            $filter = null;
        }

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ]
            ]
        );

        // results are grouped per query embedding, we only sent one, so we read group 0
        $chunks = [];
        foreach (($data['ids'][0] ?? []) as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][0][$idx]['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $data['metadatas'][0][$idx]['language'] ?? '',
                $data['metadatas'][0][$idx]['created'],
                // NOTE(review): this passes a distance - confirm that Chunk does not expect a similarity here
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $count = $this->runQuery('/collections/' . $this->getCollectionID() . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $this->getCollectionID(),
            'chunks' => $count
        ];
    }
}
313