<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication happens via JSON requests against the /api/v1 endpoints of the
 * configured Chroma server. The collection is looked up (or created) by name on first
 * use and its ID is cached for the lifetime of this object.
 */
class ChromaStorage extends AbstractStorage
{
    /** @var int Number of consecutive chunk IDs that are considered to belong to one page */
    protected const CHUNKS_PER_PAGE = 100;

    /** @var string URL to the chroma server instance */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string tenant to use, sent as query parameter with each request */
    protected $tenant = 'default_tenant';
    /** @var string database to use, sent as query parameter with each request */
    protected $database = 'default_database';
    /** @var string name of the collection */
    protected $collection = '';
    /** @var string cached ID of the collection, empty until first looked up */
    protected $collectionID = '';

    /** @inheritdoc */
    public function __construct(array $config)
    {
        // a trailing slash would result in a double slash when the API path is appended
        $this->baseurl = rtrim((string)($config['chroma_baseurl'] ?? ''), '/');
        $this->collection = (string)($config['chroma_collection'] ?? '');

        // only override the declared defaults when a value was actually configured
        $tenant = (string)($config['chroma_tenant'] ?? '');
        if ($tenant !== '') {
            $this->tenant = $tenant;
        }
        $database = (string)($config['chroma_database'] ?? '');
        if ($database !== '') {
            $this->database = $database;
        }

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        if (!empty($config['chroma_apikey'])) {
            $this->http->headers['Authorization'] = 'Bearer ' . $config['chroma_apikey'];
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET|DELETE
     * @return mixed The decoded JSON response (arrays are returned as associative arrays)
     * @throws \Exception when there is no response, the response is not valid JSON or the
     *                    API answered with a status other than 200
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        // tenant and database are user configured, so they need to be encoded properly
        $query = http_build_query(
            ['tenant' => $this->tenant, 'database' => $this->database],
            '',
            '&',
            PHP_QUERY_RFC3986
        );
        $url = $this->baseurl . '/api/v1' . $endpoint . '?' . $query;

        if ($data === []) {
            // an empty PHP array would be encoded as [] but the API expects an object
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;

        // compare against the empty string explicitly: a body of "0" (e.g. the count of an
        // empty collection) is a valid response but would be falsy in a loose check
        if ((string)$response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        if ((int)$this->http->status !== 200) {
            // error details can be found in different places of the response
            if (isset($result['detail'][0]['msg'])) {
                $error = $result['detail'][0]['msg'];
            } elseif (isset($result['detail']['msg'])) {
                $error = $result['detail']['msg'];
            } elseif (isset($result['detail']) && is_string($result['detail'])) {
                $error = $result['detail'];
            } elseif (isset($result['error'])) {
                $error = $result['error'];
            } else {
                $error = $this->http->error;
            }

            // make sure structured error data does not break the message concatenation
            if (!is_string($error)) {
                $error = (string)json_encode($error);
            }

            throw new \Exception('Chroma API returned error. ' . $error);
        }

        return $result;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is created if it does not exist yet. The ID is cached in this object.
     *
     * @return string
     * @throws \Exception
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) return $this->collectionID;

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true
            ]
        );

        // without an ID all following requests would go to broken URLs
        if (!is_array($result) || empty($result['id'])) {
            throw new \Exception('Chroma API returned no collection ID.');
        }

        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Get the list of all chunk IDs that may belong to a page
     *
     * @param int $firstChunkID The ID of the first chunk of the page
     * @return string[] CHUNKS_PER_PAGE consecutive IDs starting at $firstChunkID, as strings
     */
    protected function getPageChunkIDs($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::CHUNKS_PER_PAGE - 1, 1);
        // the IDs are sent to the API as strings
        return array_map(static fn($id) => (string)$id, $ids);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ]
            ]
        );

        // nothing found
        if (empty($data['ids'])) return null;

        return new Chunk(
            $data['metadatas'][0]['page'],
            (int)$data['ids'][0],
            $data['documents'][0],
            $data['embeddings'][0],
            $data['metadatas'][0]['language'] ?? '',
            $data['metadatas'][0]['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the collection is deleted by name, it will be recreated on next access
            $this->runQuery('/collections/' . rawurlencode($this->collection), '', 'DELETE');
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID)
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        // the API expects parallel lists, one entry per chunk in each of them
        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage()
            ];
            $documents[] = $chunk->getText();
        }

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID),
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ],
                'limit' => self::CHUNKS_PER_PAGE,
            ]
        );

        // always return a list, even when nothing was found
        if (empty($data['ids'])) return [];

        $chunks = [];
        foreach ($data['ids'] as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][$idx]['page'],
                (int)$id,
                $data['documents'][$idx],
                $data['embeddings'][$idx],
                $data['metadatas'][$idx]['language'] ?? '',
                $data['metadatas'][$idx]['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        if ($lang) {
            $filter = ['language' => $lang];
        } else {
            $filter = null;
        }

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ]
            ]
        );

        // results are nested one level deeper: one result list per query embedding.
        // We only sent a single embedding, so only the first list is of interest
        $chunks = [];
        foreach (($data['ids'][0] ?? []) as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][0][$idx]['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $data['metadatas'][0][$idx]['language'] ?? '',
                $data['metadatas'][0][$idx]['created'],
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $count = $this->runQuery('/collections/' . $this->getCollectionID() . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $this->getCollectionID(),
            'chunks' => $count
        ];
    }
}