xref: /plugin/aichat/Storage/ChromaStorage.php (revision 30b9cbc7090f3bc7eca85292f4ff98a1cf513f8f)
1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication with the server is done through JSON requests against
 * the `/api/v1` endpoints below the configured base URL.
 */
class ChromaStorage extends AbstractStorage
{
    /** @var int Number of consecutive chunk IDs (starting at a page's first ID) that are probed per page */
    protected const MAX_CHUNKS_PER_PAGE = 100;

    /** @var string URL to the chroma server instance */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string Tenant passed with every request */
    protected $tenant = 'default_tenant';
    /** @var string Database passed with every request */
    protected $database = 'default_database';
    /** @var string Name of the configured collection */
    protected $collection = '';
    /** @var string Cached ID of the configured collection, empty until first looked up */
    protected $collectionID = '';

    /**
     * ChromaStorage constructor.
     *
     * Reads the connection settings from the aichat helper plugin configuration
     * and prepares the HTTP client for JSON communication.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // strip trailing slashes so appending '/api/v1' never produces a double slash
        $this->baseurl = rtrim((string)$helper->getConf('chroma_baseurl'), '/');
        // keep the class defaults when tenant or database are not configured
        $this->tenant = $helper->getConf('chroma_tenant') ?: $this->tenant;
        $this->database = $helper->getConf('chroma_database') ?: $this->database;
        $this->collection = $helper->getConf('chroma_collection');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        $apikey = $helper->getConf('chroma_apikey');
        if ($apikey) {
            $this->http->headers['Authorization'] = 'Bearer ' . $apikey;
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET|DELETE
     * @return mixed The decoded JSON response
     * @throws \Exception when the request fails, the response is empty, is not valid JSON or signals an error
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . '/api/v1' . $endpoint .
            '?tenant=' . rawurlencode((string)$this->tenant) .
            '&database=' . rawurlencode((string)$this->database);

        // an empty PHP array would be encoded as a JSON list, but an empty object is wanted here
        if (is_array($data) && $data === []) {
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;

        // Compare explicitly: a plain truthiness check would reject the valid JSON body "0"
        // (e.g. the count of an empty collection) because PHP treats the string "0" as false
        if ($response === null || $response === false || $response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        if ((int)$this->http->status !== 200) {
            // the error message may be found in several places of the response
            if (isset($result['detail'][0]['msg'])) {
                $error = $result['detail'][0]['msg'];
            } elseif (isset($result['detail']['msg'])) {
                $error = $result['detail']['msg'];
            } elseif (isset($result['detail']) && is_string($result['detail'])) {
                $error = $result['detail'];
            } elseif (isset($result['error'])) {
                $error = $result['error'];
            } else {
                $error = $this->http->error;
            }

            throw new \Exception('Chroma API returned error. ' . $error);
        }

        return $result;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is requested with `get_or_create`, the returned ID is cached
     * in this object for subsequent calls.
     *
     * @return string
     * @throws \Exception
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) return $this->collectionID;

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true
            ]
        );

        if (!is_array($result) || empty($result['id'])) {
            throw new \Exception('Chroma API returned no collection ID.');
        }

        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Build the list of chunk IDs that may belong to a page
     *
     * @param int $firstChunkID The first chunk ID of the page
     * @return string[] MAX_CHUNKS_PER_PAGE consecutive IDs, starting at $firstChunkID, as strings
     */
    protected function getPossibleChunkIDs($firstChunkID)
    {
        $firstChunkID = (int)$firstChunkID;

        return array_map(
            static fn($id) => (string)$id,
            range($firstChunkID, $firstChunkID + self::MAX_CHUNKS_PER_PAGE - 1)
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ]
            ]
        );

        if (!$data || empty($data['ids'])) return null;

        return new Chunk(
            $data['metadatas'][0]['page'],
            (int)$data['ids'][0],
            $data['documents'][0],
            $data['embeddings'][0],
            $data['metadatas'][0]['language'] ?? '',
            $data['metadatas'][0]['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the collection is deleted by its name; the cached ID is reset so the next lookup asks the server again
            $this->runQuery('/collections/' . rawurlencode((string)$this->collection), '', 'DELETE');
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->getPossibleChunkIDs($firstChunkID)
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage()
            ];
            $documents[] = $chunk->getText();
        }

        // nothing to store, avoid sending an upsert without any records
        if ($ids === []) return;

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $this->getPossibleChunkIDs($firstChunkID),
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ],
                'limit' => self::MAX_CHUNKS_PER_PAGE,
            ]
        );

        // always return a list, so callers can iterate over the result without checking for null
        if (!$data || empty($data['ids'])) return [];

        $chunks = [];
        foreach ($data['ids'] as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][$idx]['page'],
                (int)$id,
                $data['documents'][$idx],
                $data['embeddings'][$idx],
                $data['metadatas'][$idx]['language'] ?? '',
                $data['metadatas'][$idx]['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        if ($lang) {
            // the operator has to be the key, so this is encoded as {"language": {"$eq": "<lang>"}}
            $filter = ['language' => ['$eq' => $lang]];
        } else {
            $filter = null;
        }

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ]
            ]
        );

        // only one query embedding is sent, so only the first result set is of interest
        $chunks = [];
        foreach (($data['ids'][0] ?? []) as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][0][$idx]['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $data['metadatas'][0][$idx]['language'] ?? '',
                $data['metadatas'][0][$idx]['created'],
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $collectionID = $this->getCollectionID();
        $count = $this->runQuery('/collections/' . $collectionID . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $collectionID,
            'chunks' => $count
        ];
    }
}
308