<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication happens through JSON requests against the index URL
 * configured in `pinecone_baseurl`, authenticated with `pinecone_apikey`.
 */
class PineconeStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs (starting at a page's first chunk ID) that are
     * addressed when all chunks of a page have to be fetched or deleted.
     */
    protected const ID_RANGE_SIZE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * Store the index URL and prepare a HTTP client carrying the API key and JSON headers
     *
     * @inheritdoc
     */
    public function __construct(array $config)
    {
        $this->baseurl = $config['pinecone_baseurl'] ?? '';

        $this->http = new DokuHTTPClient();
        // default to an empty key like the base URL does, instead of raising a warning
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response
     * @throws \Exception when no response was received, the response is not valid JSON,
     *                    or the response contains an error message
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // an empty PHP array would be encoded as "[]", send an empty JSON object instead
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // an empty body can never be valid JSON; report it together with the client's
        // error message instead of a misleading "invalid JSON" error
        if ((string)$response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }

        if (isset($result['message'])) {
            // the exception code has to be an integer, anything else is mapped to 0
            $code = $result['code'] ?? 0;
            throw new \Exception(
                'Pinecone API returned error. ' . $result['message'],
                is_numeric($code) ? (int)$code : 0
            );
        }

        return $result;
    }

    /**
     * Return the list of chunk IDs that are addressed for a page
     *
     * @param int $firstChunkID The first chunk ID of the page
     * @return int[] self::ID_RANGE_SIZE consecutive IDs starting at $firstChunkID
     */
    protected function possibleChunkIds($firstChunkID)
    {
        return range($firstChunkID, $firstChunkID + self::ID_RANGE_SIZE - 1, 1);
    }

    /**
     * Extract a list of vector records from a decoded API response
     *
     * @param mixed $data The decoded response as returned by runQuery()
     * @param string $key The response key holding the records
     * @return array The records, empty when the key is missing or not an array
     */
    protected function recordsFromResponse($data, $key)
    {
        if (!is_array($data)) return [];
        $records = $data[$key] ?? [];
        return is_array($records) ? $records : [];
    }

    /**
     * Build a Chunk from a vector record as returned by the fetch and query endpoints
     *
     * @param array $vector The vector record (id, values, metadata and optionally score)
     * @param mixed $chunkID The ID to assign, defaults to the ID stored in the record
     * @return Chunk
     */
    protected function chunkFromVector(array $vector, $chunkID = null)
    {
        $args = [
            $vector['metadata']['page'],
            $chunkID ?? $vector['id'],
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created'],
        ];
        // only pass a score when the record has one, otherwise the Chunk default applies
        if (isset($vector['score'])) {
            $args[] = $vector['score'];
        }

        return new Chunk(...$args);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );

        // a missing or empty vector list means the chunk does not exist
        $vectors = $this->recordsFromResponse($data, 'vectors');
        $vector = reset($vectors);
        if (!$vector) return null;

        return $this->chunkFromVector($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs, the API is given the IDs as strings
        $ids = array_map(static fn($id) => (string)$id, $this->possibleChunkIds($firstChunkID));

        try {
            $this->runQuery('/vectors/delete', ['ids' => $ids]);
        } catch (\Exception $e) {
            // 5 is the code for "namespace not found" See #12
            if ($e->getCode() !== 5) throw $e;
        }
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                // NOTE(review): no 'language' is stored here although getChunk(), getPageChunks()
                // and the filter in getSimilarChunks() read metadata.language - presumably the
                // chunk's language should be added to the metadata; confirm against Chunk
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ],
            ];
        }

        // nothing to store, avoid a pointless request
        if ($vectors === []) return;

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the ids parameter is repeated once per ID: ids=1&ids=2&...
        $query = 'ids=' . implode('&ids=', $this->possibleChunkIds($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );

        $chunks = [];
        foreach ($this->recordsFromResponse($data, 'vectors') as $vector) {
            $chunks[] = $this->chunkFromVector($vector);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'includeMetadata' => true,
            'includeValues' => true,
        ];

        if ($lang) {
            // the operator has to be the key of an object: {"language": {"$eq": "en"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach ($this->recordsFromResponse($response, 'matches') as $match) {
            $chunks[] = $this->chunkFromVector($match);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}