<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which talks JSON to the index URL configured
 * in the plugin settings. Chunks are stored as vectors whose ID is the chunk ID and whose
 * metadata carries page, text, creation time and language.
 */
class PineconeStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs (starting at a page's first chunk ID) that are probed
     * when fetching or deleting the chunks of a page
     */
    protected const PAGE_CHUNK_RANGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads base URL and API key from the aichat helper plugin configuration and prepares
     * an HTTP client that sends the key and JSON content headers with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // endpoints passed to runQuery() start with a slash, avoid doubling it
        $this->baseurl = rtrim((string)$helper->getConf('pinecone_baseurl'), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when there is no response, the response is not valid JSON or
     *                    the response carries an error message
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // json_encode() would turn an empty PHP array into the list "[]", send an empty object instead
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            // keep the raw body in the message, the bare JSON error is not helpful on its own
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build a Chunk from a vector record as returned by the fetch and query endpoints
     *
     * @param array $vector record with 'id', 'values', 'metadata' and optionally 'score'
     * @return Chunk
     */
    protected function chunkFromVector(array $vector)
    {
        $args = [
            $vector['metadata']['page'],
            $vector['id'],
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created'],
        ];

        // only query matches carry a score, otherwise the constructor's own default applies
        if (array_key_exists('score', $vector)) {
            $args[] = $vector['score'];
        }

        return new Chunk(...$args);
    }

    /**
     * Get all chunk IDs a page might be using
     *
     * @param int $firstChunkID the ID of the page's first chunk
     * @return string[] PAGE_CHUNK_RANGE consecutive IDs starting at $firstChunkID
     */
    protected function possibleChunkIds($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::PAGE_CHUNK_RANGE - 1, 1);
        return array_map(static fn($id) => (string)$id, $ids);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        if (!is_array($data) || empty($data['vectors']) || !is_array($data['vectors'])) return null;

        $vector = array_shift($data['vectors']);
        if (!$vector) return null;

        // hand back the ID exactly as it was requested
        $vector['id'] = $chunkID;
        return $this->chunkFromVector($vector);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            // the API documents this flag as a boolean
            $this->runQuery('/vectors/delete', ['delete_all' => true]);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery('/vectors/delete', ['ids' => $this->possibleChunkIds($firstChunkID)]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                    // getSimilarChunks() filters on this field and all readers pass it back to Chunk
                    // NOTE(review): assumes Chunk exposes getLanguage() - confirm against the Chunk class
                    'language' => $chunk->getLanguage(),
                ],
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the fetch endpoint takes the IDs as a repeated query parameter
        $query = 'ids=' . implode('&ids=', $this->possibleChunkIds($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!is_array($data) || empty($data['vectors']) || !is_array($data['vectors'])) return [];

        $chunks = [];
        foreach ($data['vectors'] as $vector) {
            $chunks[] = $this->chunkFromVector($vector);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];
        if ($lang) {
            // the operator has to be the key of an object, not the first element of a list
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = $this->chunkFromVector($match);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'],
            'fullness' => $data['indexFullness'],
        ];
    }
}