<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication happens via JSON requests against the server's /api/v1 endpoints.
 * Chunk IDs are sent to the API as strings and converted back to integers when read.
 */
class ChromaStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs, starting at a page's first chunk ID, that are
     * probed when fetching or deleting the chunks of a page
     */
    protected const PAGE_CHUNK_SLOTS = 100;

    /** @var string URL to the chroma server instance (without trailing slash) */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string tenant passed to every API call */
    protected $tenant = 'default_tenant';
    /** @var string database passed to every API call */
    protected $database = 'default_database';
    /** @var string name of the configured collection */
    protected $collection = '';
    /** @var string cached ID of the configured collection, empty until first looked up */
    protected $collectionID = '';

    /**
     * ChromaStorage constructor.
     *
     * Reads the connection settings from the aichat helper plugin and prepares the
     * HTTP client (JSON headers, optional bearer token).
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // a trailing slash would result in a double slash in the request URL
        $this->baseurl = rtrim((string)$helper->getConf('chroma_baseurl'), '/');
        // keep the property defaults when nothing is configured
        $this->tenant = $helper->getConf('chroma_tenant') ?: $this->tenant;
        $this->database = $helper->getConf('chroma_database') ?: $this->database;
        $this->collection = $helper->getConf('chroma_collection');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        $apikey = $helper->getConf('chroma_apikey');
        if ($apikey) {
            $this->http->headers['Authorization'] = 'Bearer ' . $apikey;
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response
     * @throws \Exception when the request fails, the response is empty or not valid JSON,
     *                    or the API signals an error
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . '/api/v1' . $endpoint .
            '?tenant=' . rawurlencode((string)$this->tenant) .
            '&database=' . rawurlencode((string)$this->database);

        // an empty PHP array would be encoded as a JSON list "[]", send an empty object instead
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = (string)$this->http->resp_body;

        // explicitly compare to the empty string: a body of "0" (e.g. the count of an
        // empty collection) is a valid response but would be falsy in a boolean check
        if ($response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode($response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        // any 2xx status is a success
        $status = (int)$this->http->status;
        if ($status < 200 || $status >= 300) {
            throw new \Exception('Chroma API returned error. ' . $this->extractError($result));
        }

        return $result;
    }

    /**
     * Find the most specific error message in a decoded error response
     *
     * Falls back to the HTTP client's own error when the response has none of the
     * known error fields.
     *
     * @param mixed $result The decoded JSON response
     * @return string
     */
    protected function extractError($result)
    {
        if (isset($result['detail'][0]['msg'])) {
            $error = $result['detail'][0]['msg'];
        } elseif (isset($result['detail']['msg'])) {
            $error = $result['detail']['msg'];
        } elseif (isset($result['detail']) && is_string($result['detail'])) {
            $error = $result['detail'];
        } elseif (isset($result['error'])) {
            $error = $result['error'];
        } else {
            $error = $this->http->error;
        }

        // structured errors can not be concatenated into the exception message as is
        if (is_array($error) || is_object($error)) {
            $error = json_encode($error);
        }

        return (string)$error;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is requested with get_or_create, the resulting ID is cached
     * in this object.
     *
     * @return string
     * @throws \Exception
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) return $this->collectionID;

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true
            ]
        );

        if (!isset($result['id'])) {
            throw new \Exception('Chroma API returned no collection ID.');
        }

        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Build the list of chunk IDs that may belong to a page
     *
     * @param int $firstChunkID The ID of the page's first chunk
     * @return string[] The IDs as strings, as used in API requests
     */
    protected function getPageChunkIDs($firstChunkID)
    {
        $first = (int)$firstChunkID;
        return array_map(
            static fn($id) => (string)$id,
            range($first, $first + self::PAGE_CHUNK_SLOTS - 1)
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ]
            ]
        );

        // nothing stored under this ID
        if (!$data || empty($data['ids'])) return null;

        return new Chunk(
            $data['metadatas'][0]['page'],
            (int)$data['ids'][0],
            $data['documents'][0],
            $data['embeddings'][0],
            $data['metadatas'][0]['language'] ?? '',
            $data['metadatas'][0]['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the collection is deleted by name
            $this->runQuery('/collections/' . rawurlencode((string)$this->collection), '', 'DELETE');
            // forget the cached ID so the next lookup requests the collection again
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID)
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        // the API takes parallel lists, one entry per chunk in each of them
        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage()
            ];
            $documents[] = $chunk->getText();
        }

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID),
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings'
                ],
                'limit' => self::PAGE_CHUNK_SLOTS,
            ]
        );

        // always return a list, even when nothing was found
        if (!$data || empty($data['ids'])) return [];

        $chunks = [];
        foreach ($data['ids'] as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][$idx]['page'],
                (int)$id,
                $data['documents'][$idx],
                $data['embeddings'][$idx],
                $data['metadatas'][$idx]['language'] ?? '',
                $data['metadatas'][$idx]['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        // metadata filters are given as field => [operator => value]
        $filter = $lang ? ['language' => ['$eq' => $lang]] : null;

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ]
            ]
        );

        // results are grouped per query embedding, we only sent one
        $chunks = [];
        foreach (($data['ids'][0] ?? []) as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][0][$idx]['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $data['metadatas'][0][$idx]['language'] ?? '',
                $data['metadatas'][0][$idx]['created'],
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $collectionID = $this->getCollectionID();
        $count = $this->runQuery('/collections/' . $collectionID . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $collectionID,
            'chunks' => $count
        ];
    }
}