<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which sends JSON to the configured
 * index URL using a pre-authenticated HTTP client. Chunks belonging to a page are
 * addressed by a block of 100 consecutive IDs starting at the page's first chunk ID.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /** @inheritdoc */
    public function __construct(array $config)
    {
        $this->baseurl = $config['pinecone_baseurl'] ?? '';

        $this->http = new DokuHTTPClient();
        // default to an empty key, mirroring the base URL handling above
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when there is no response body, the body is not valid JSON
     *                    (a \JsonException) or the response carries a 'message' entry
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // an empty PHP array would be encoded as "[]", send an empty object instead
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // treat an empty body like a missing one so the client's error text is
        // reported instead of an opaque JSON syntax error
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        if ($result === null) {
            // only reachable for a literal JSON "null"; syntax errors throw above
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of all chunk IDs that may belong to a page
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return int[] the 100 consecutive IDs starting at $firstChunkID
     */
    private function pageChunkIds($firstChunkID)
    {
        return range($firstChunkID, $firstChunkID + 99, 1);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        // guard against a response without any vectors
        if (!$data || empty($data['vectors'])) return null;
        $vector = array_shift($data['vectors']);
        if (!$vector) return null;

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            // NOTE(review): the flag is sent as the string 'True', not a boolean - confirm the API accepts this
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs, the API is given them as strings
        $ids = array_map(static fn($id) => (string)$id, $this->pageChunkIds($firstChunkID));
        $this->runQuery('/vectors/delete', ['ids' => $ids]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            // NOTE(review): no 'language' metadata is stored here, yet getSimilarChunks()
            // filters on it and the fetch methods read it - confirm whether it should be added
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        // nothing to store, do not send an empty upsert
        if ($vectors === []) return;

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // repeat the ids parameter once per possible chunk ID
        $query = 'ids=' . implode('&ids=', $this->pageChunkIds($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data || empty($data['vectors'])) return [];

        $chunks = [];
        foreach ($data['vectors'] as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['language'] ?? '',
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];
        if ($lang) {
            // the operator has to be the key of a map: {"language": {"$eq": "<lang>"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }
        // without a language the filter is left out entirely, an empty PHP array
        // would be encoded as a JSON list instead of an object

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach ($response['matches'] ?? [] as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['language'] ?? '',
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'],
            'fullness' => $data['indexFullness'],
        ];
    }
}