<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication happens via JSON over HTTP through runQuery(). Because chunks can not be looked up
 * by metadata, page level operations address chunks through the block of 100 consecutive chunk IDs that
 * starts at the page's first chunk ID.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * Set up the pre-authenticated HTTP client
     *
     * Reads the 'pinecone_baseurl' and 'pinecone_apikey' configuration keys.
     *
     * @inheritdoc
     */
    public function __construct(array $config)
    {
        // endpoints always start with a slash, so a trailing slash here would result in '//'
        $this->baseurl = rtrim((string)($config['pinecone_baseurl'] ?? ''), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when there is no response, the response is no valid JSON or it carries a 'message'
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // json_encode() would turn an empty array into '[]', send an empty object instead
        if (is_array($data) && $data === []) {
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        // the return value is ignored on purpose: the body is inspected for an error message below
        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }

        // a 'message' key in the response is treated as an error report
        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Create a Chunk from a vector record as returned by the API
     *
     * @param array $vector the record, needs 'values' and 'metadata' (page, text, created, optionally language)
     * @param int|string $chunkID the ID to assign to the chunk
     * @param mixed ...$extra additional constructor arguments, passed on after the creation time (eg. the score)
     * @return Chunk
     */
    private function vectorToChunk(array $vector, $chunkID, ...$extra)
    {
        $meta = $vector['metadata'];

        return new Chunk(
            $meta['page'],
            $chunkID,
            $meta['text'],
            $vector['values'],
            $meta['language'] ?? '',
            $meta['created'],
            ...$extra
        );
    }

    /**
     * Get all chunk IDs a page may be using
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return string[] the 100 consecutive IDs starting at $firstChunkID, as strings
     */
    private function pageChunkIds($firstChunkID)
    {
        return array_map(
            static fn($id) => (string)$id,
            range($firstChunkID, $firstChunkID + 99, 1)
        );
    }

    /**
     * Fetch a single chunk by its ID
     *
     * Returns null when the chunk does not exist or when reusing chunks has been disabled.
     *
     * @inheritdoc
     */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );
        if (!$data) return null;

        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors)) return null;

        // only one ID was requested, so the first record is the one we want
        $vector = array_shift($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            // NOTE(review): key and value look like the Python client's spelling; confirm whether the
            // REST API expects 'deleteAll' => true instead
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Delete all chunks of a page
     *
     * The page name is not used, all possible chunk IDs of the page are deleted instead.
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->runQuery('/vectors/delete', ['ids' => $this->pageChunkIds($firstChunkID)]);
    }

    /**
     * Store the given chunks in a single upsert request
     *
     * @inheritdoc
     */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                    // needed by the language filter in getSimilarChunks() and read back when fetching
                    'language' => (string)$chunk->getLanguage(),
                ],
            ];
        }

        if ($vectors === []) return; // nothing to store, skip the request

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /**
     * Allow reusing existing chunks again
     *
     * @inheritdoc
     */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        $query = implode(
            '&',
            array_map(
                static fn($id) => 'ids=' . $id,
                $this->pageChunkIds($firstChunkID)
            )
        );

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data) return [];

        $chunks = [];
        foreach (($data['vectors'] ?? []) as $vector) {
            $chunks[] = $this->vectorToChunk($vector, $vector['id']);
        }
        return $chunks;
    }

    /**
     * Query the index for the chunks most similar to the given vector
     *
     * Twice the requested number of chunks is requested from the API. When a language is given, only
     * chunks whose 'language' metadata equals it are matched.
     *
     * @inheritdoc
     */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];
        if ($lang) {
            // only add a filter when needed, an empty array would be encoded as '[]'
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = $this->vectorToChunk($match, $match['id'], $match['score']);
        }
        return $chunks;
    }

    /**
     * Report the vector count and fullness of the index
     *
     * @inheritdoc
     */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}