<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication goes through the Chroma REST API (v1). Chunk IDs are sent to Chroma as
 * strings and cast back to integers when results are read.
 */
class ChromaStorage extends AbstractStorage
{
    /** @var string URL to the chroma server instance */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string the tenant to use, overwritten from the plugin configuration */
    protected $tenant = 'default_tenant';
    /** @var string the database to use, overwritten from the plugin configuration */
    protected $database = 'default_database';
    /** @var string name of the collection, read from the plugin configuration */
    protected $collection = '';
    /** @var string cached ID of the collection, resolved lazily by getCollectionID() */
    protected $collectionID = '';

    /**
     * ChromaStorage constructor.
     *
     * Reads the connection settings from the aichat helper plugin and prepares the HTTP client
     * for JSON requests. A bearer token is only sent when an API key is configured.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        $this->baseurl = $helper->getConf('chroma_baseurl');
        $this->tenant = $helper->getConf('chroma_tenant');
        $this->database = $helper->getConf('chroma_database');
        $this->collection = $helper->getConf('chroma_collection');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        $apiKey = $helper->getConf('chroma_apikey');
        if ($apiKey) {
            $this->http->headers['Authorization'] = 'Bearer ' . $apiKey;
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response (may be a scalar, e.g. for count or version)
     * @throws \Exception when the request could not be encoded, no response was received,
     *                    the response is not valid JSON, or the status is not 200
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        // tenant and database are user configuration, so they have to be encoded for the query string
        $url = $this->baseurl . '/api/v1' . $endpoint .
            '?tenant=' . rawurlencode((string)$this->tenant) .
            '&database=' . rawurlencode((string)$this->database);

        // an empty PHP array would be encoded as a JSON list, the API expects an object
        if ($data === []) {
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;

        // compare explicitly: a body of "0" (e.g. the count of an empty collection) is a valid
        // response but would be considered false by a loose check
        if ($response === null || $response === false || $response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode($response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\Exception $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        if ((int)$this->http->status !== 200) {
            // the error message may be placed in different parts of the response
            if (isset($result['detail'][0]['msg'])) {
                $error = $result['detail'][0]['msg'];
            } elseif (isset($result['detail']['msg'])) {
                $error = $result['detail']['msg'];
            } elseif (isset($result['detail']) && is_string($result['detail'])) {
                $error = $result['detail'];
            } elseif (isset($result['error'])) {
                $error = $result['error'];
            } else {
                $error = $this->http->error;
            }

            // structured error payloads can not be concatenated directly
            if (!is_string($error)) {
                $error = json_encode($error);
            }

            throw new \Exception('Chroma API returned error. ' . $error);
        }

        return $result;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is created if it does not exist yet. The ID is cached in this instance.
     *
     * @return string
     * @throws \Exception
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) return $this->collectionID;

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true,
            ]
        );
        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Build the list of chunk IDs that are requested for a page
     *
     * Returns the 100 consecutive IDs starting at the given one, as strings. Presumably a page
     * never uses more than 100 chunk IDs; that limit is not enforced here.
     *
     * @param int $firstChunkID
     * @return string[]
     */
    private function pageChunkIdRange($firstChunkID)
    {
        return array_map('strval', range($firstChunkID, $firstChunkID + 99));
    }

    /**
     * Create a Chunk from one entry of a "get" response
     *
     * @param array $data The decoded response containing ids, metadatas, documents and embeddings
     * @param int $idx Position of the entry within the response lists
     * @return Chunk
     */
    private function chunkFromGetResult(array $data, $idx)
    {
        $meta = $data['metadatas'][$idx];

        return new Chunk(
            $meta['page'],
            (int)$data['ids'][$idx],
            $data['documents'][$idx],
            $data['embeddings'][$idx],
            $meta['language'] ?? '',
            $meta['created']
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                ],
            ]
        );

        // nothing stored under this ID
        if (!$data || empty($data['ids'])) return null;

        return $this->chunkFromGetResult($data, 0);
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the delete endpoint is addressed with the collection name, not the ID
            $this->runQuery('/collections/' . $this->collection, '', 'DELETE');
            // forget the cached ID, the next access will create a fresh collection
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->pageChunkIdRange($firstChunkID),
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        // the API expects one list per field, all of them in the same order
        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage(),
            ];
            $documents[] = $chunk->getText();
        }

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents,
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $ids = $this->pageChunkIdRange($firstChunkID);

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $ids,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                ],
                'limit' => count($ids),
            ]
        );

        // always return a list, so the result can safely be iterated
        if (!$data || empty($data['ids'])) return [];

        $chunks = [];
        foreach (array_keys($data['ids']) as $idx) {
            $chunks[] = $this->chunkFromGetResult($data, $idx);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        if ($lang) {
            // operator expression: has to be encoded as the JSON object {"$eq": "<lang>"}
            $filter = ['language' => ['$eq' => $lang]];
        } else {
            $filter = null;
        }

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ],
            ]
        );

        // results are grouped per query embedding, we only sent one, so we only read group 0
        $chunks = [];
        foreach (($data['ids'][0] ?? []) as $idx => $id) {
            $meta = $data['metadatas'][0][$idx];
            $chunks[] = new Chunk(
                $meta['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $meta['language'] ?? '',
                $meta['created'],
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $collectionID = $this->getCollectionID();
        $count = $this->runQuery('/collections/' . $collectionID . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $collectionID,
            'chunks' => $count,
        ];
    }
}