<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which talks JSON to the configured index URL.
 * Chunks are stored as Pinecone vectors: the chunk ID is the vector ID, the embedding is
 * the vector's values and page, text and creation time are kept in the vector's metadata.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance (without trailing slash) */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the index URL and API key from the aichat helper plugin's configuration and
     * prepares an HTTP client that sends the API key and JSON headers with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // endpoints passed to runQuery() start with a slash, so drop a trailing one here
        $this->baseurl = rtrim((string)$helper->getConf('pinecone_baseurl'), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response (objects are returned as associative arrays)
     * @throws \Exception when the request can not be encoded, no or invalid JSON is returned
     *                    or the response carries an error message
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        if ($data === []) {
            // an empty PHP array would be encoded as a JSON list, but we want an empty object
            $json = '{}';
        } else {
            $json = json_encode($data);
            if ($json === false) {
                throw new \Exception('Pinecone API request could not be encoded. ' . json_last_error_msg());
            }
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            // nothing to decode; report whatever the HTTP client recorded as error
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode($response, true);
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Create a Chunk object from a vector as returned by the Pinecone API
     *
     * @param array $vector A single vector/match with 'metadata' and 'values' (and 'score' for matches)
     * @param mixed $id The chunk ID to assign
     * @param bool $withScore Pass the vector's similarity score on to the chunk?
     * @return Chunk
     */
    protected function vectorToChunk(array $vector, $id, $withScore = false)
    {
        $metadata = $vector['metadata'];
        $language = $metadata['language'] ?? ''; // not all stored vectors carry a language

        if ($withScore) {
            return new Chunk(
                $metadata['page'],
                $id,
                $metadata['text'],
                $vector['values'],
                $language,
                $metadata['created'],
                $vector['score']
            );
        }

        return new Chunk(
            $metadata['page'],
            $id,
            $metadata['text'],
            $vector['values'],
            $language,
            $metadata['created']
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );

        // a missing or empty vector list means the chunk does not exist
        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors) || !$vectors) return null;

        $vector = reset($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            // the flag has to be a real boolean, not the string 'True'
            $this->runQuery('/vectors/delete', ['delete_all' => true]);
        } catch (\Exception $e) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs: the 100 consecutive IDs starting at the first one
        $ids = array_map('strval', range($firstChunkID, $firstChunkID + 99, 1));
        $this->runQuery('/vectors/delete', ['ids' => $ids]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            // NOTE(review): getChunk(), getPageChunks() and getSimilarChunks() read a 'language'
            // metadata field and getSimilarChunks() filters on it, but it is never written here.
            // Confirm how Chunk exposes its language and store it alongside the other metadata.
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        if (!$vectors) return; // nothing to store, skip the API call

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // build ids=1&ids=2&... for the 100 consecutive IDs starting at the first one
        $params = array_map(
            static function ($id) {
                return 'ids=' . $id;
            },
            range($firstChunkID, $firstChunkID + 99, 1)
        );

        $data = $this->runQuery(
            '/vectors/fetch?' . implode('&', $params),
            '',
            'GET'
        );
        if (!$data) return [];

        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors)) return [];

        $chunks = [];
        foreach ($vectors as $vector) {
            $chunks[] = $this->vectorToChunk($vector, $vector['id']);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];
        if ($lang) {
            // the filter operator has to be the key, the value to compare with is its value
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $matches = $response['matches'] ?? [];
        if (!is_array($matches)) return [];

        $chunks = [];
        foreach ($matches as $match) {
            $chunks[] = $this->vectorToChunk($match, $match['id'], true);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}