1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication happens via JSON over HTTP against the `/api/v1` endpoints
 * of the configured Chroma server. Chunk IDs are sent to Chroma as strings and
 * converted back to integers when chunks are read.
 */
class ChromaStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs (starting at a page's first chunk ID) that are
     * looked at when fetching or deleting the chunks of a page.
     * NOTE(review): presumably matches the ID spacing the caller reserves per page - confirm.
     */
    private const CHUNK_ID_RANGE = 100;

    /** @var string URL to the chroma server instance (without trailing slash) */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string tenant sent with every request */
    protected $tenant = 'default_tenant';
    /** @var string database sent with every request */
    protected $database = 'default_database';
    /** @var string name of the collection to use */
    protected $collection = '';
    /** @var string cached ID of the collection, resolved lazily by getCollectionID() */
    protected $collectionID = '';

    /** @inheritdoc */
    public function __construct(array $config)
    {
        // a trailing slash would result in a double slash in front of the API path
        $this->baseurl = rtrim((string)($config['chroma_baseurl'] ?? ''), '/');
        $this->collection = (string)($config['chroma_collection'] ?? '');

        // only override the built-in defaults when a value was actually configured
        $tenant = (string)($config['chroma_tenant'] ?? '');
        if ($tenant !== '') {
            $this->tenant = $tenant;
        }
        $database = (string)($config['chroma_database'] ?? '');
        if ($database !== '') {
            $this->database = $database;
        }

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        if (!empty($config['chroma_apikey'])) {
            $this->http->headers['Authorization'] = 'Bearer ' . $config['chroma_apikey'];
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * The tenant and database are always appended as query parameters. The response
     * body is decoded from JSON; any status other than 200 is turned into an exception
     * using the most specific error message found in the response.
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response
     * @throws \Exception when the request fails, returns invalid JSON or an error status
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . '/api/v1' . $endpoint .
            '?tenant=' . rawurlencode((string)$this->tenant) .
            '&database=' . rawurlencode((string)$this->database);

        if ($data === []) {
            // an empty PHP array would be encoded as a JSON list, but an object is wanted
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;

        // do not use a truthiness check here: a body of "0" is valid JSON but falsy in PHP
        if ($response === null || $response === false || $response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\Exception $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        if ((int)$this->http->status !== 200) {
            // error details may come in several shapes, use the most specific one available
            if (isset($result['detail'][0]['msg'])) {
                $error = $result['detail'][0]['msg'];
            } elseif (isset($result['detail']['msg'])) {
                $error = $result['detail']['msg'];
            } elseif (isset($result['detail']) && is_string($result['detail'])) {
                $error = $result['detail'];
            } elseif (isset($result['error'])) {
                $error = $result['error'];
            } else {
                $error = $this->http->error;
            }

            throw new \Exception('Chroma API returned error. ' . $error);
        }

        return $result;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is requested with `get_or_create`, the resulting ID is cached
     * in this object for subsequent calls.
     *
     * @return string
     * @throws \Exception when the API call fails or the response contains no ID
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) return $this->collectionID;

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true,
            ]
        );

        // without an ID all following requests would be sent to a broken URL
        if (!is_array($result) || empty($result['id'])) {
            throw new \Exception('Chroma API returned no collection ID.');
        }

        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Create the list of chunk IDs that may belong to a page
     *
     * @param int $firstChunkID The first chunk ID of the page
     * @return string[] CHUNK_ID_RANGE consecutive IDs as strings
     */
    private function pageChunkIDs($firstChunkID)
    {
        return array_map(
            static fn($id) => (string)$id,
            range($firstChunkID, $firstChunkID + self::CHUNK_ID_RANGE - 1, 1)
        );
    }

    /**
     * Create a Chunk from one row of a `get` response
     *
     * @param array $data The decoded response with parallel ids/metadatas/documents/embeddings lists
     * @param int|string $idx The position of the wanted row within those lists
     * @return Chunk
     */
    private function chunkFromGetResult(array $data, $idx)
    {
        $meta = $data['metadatas'][$idx];

        return new Chunk(
            $meta['page'],
            (int)$data['ids'][$idx],
            $data['documents'][$idx],
            $data['embeddings'][$idx],
            $meta['language'] ?? '',
            $meta['created']
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                ],
            ]
        );

        // nothing found for this ID
        if (empty($data['ids'])) return null;

        return $this->chunkFromGetResult($data, 0);
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the collection is deleted by name; it is recreated on the next getCollectionID() call
            $this->runQuery('/collections/' . rawurlencode((string)$this->collection), '', 'DELETE');
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->pageChunkIDs($firstChunkID),
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage(),
            ];
            $documents[] = $chunk->getText();
        }

        // nothing to store, no need to bother the server with an empty upsert
        if ($ids === []) return;

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents,
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $this->pageChunkIDs($firstChunkID),
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                ],
                'limit' => self::CHUNK_ID_RANGE,
            ]
        );

        // always return an array, so callers can iterate over the result unconditionally
        if (empty($data['ids'])) return [];

        $chunks = [];
        foreach (array_keys($data['ids']) as $idx) {
            $chunks[] = $this->chunkFromGetResult($data, $idx);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        // restrict the search to chunks of the given language, if any
        $filter = $lang ? ['language' => $lang] : null;

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ],
            ]
        );

        // results are nested per query embedding; only one was sent, so only index 0 is read
        $chunks = [];
        foreach ($data['ids'][0] ?? [] as $idx => $id) {
            $meta = $data['metadatas'][0][$idx];
            $chunks[] = new Chunk(
                $meta['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $meta['language'] ?? '',
                $meta['created'],
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $collectionID = $this->getCollectionID();
        $count = $this->runQuery('/collections/' . $collectionID . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $collectionID,
            'chunks' => $count,
        ];
    }
}
304