<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication happens via JSON over HTTP using a pre-authenticated DokuHTTPClient.
 */
class PineconeStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs, starting at a page's first chunk ID, that are probed
     * when all chunks of a page have to be fetched or deleted
     */
    const CHUNKS_PER_PAGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the base URL and API key from the aichat plugin configuration and prepares the
     * HTTP client with the headers needed for every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        $this->baseurl = $helper->getConf('pinecone_baseurl');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when the payload can not be encoded, no response was received, the response
     *                    is not valid JSON or the API reported an error message
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        if (is_array($data) && !count($data)) {
            // json_encode() would turn an empty array into a list "[]", but we want an empty object
            $json = '{}';
        } else {
            $json = json_encode($data);
            if ($json === false) {
                throw new \Exception('Failed to JSON encode data for Pinecone API. ' . json_last_error_msg());
            }
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode($response, true);
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        // a "message" key in the response is treated as an error report
        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of all chunk IDs that may belong to a page
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return string[] consecutive IDs starting at $firstChunkID, as strings
     */
    protected function possibleChunkIDs($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::CHUNKS_PER_PAGE - 1, 1);
        return array_map('strval', $ids);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );
        if (!$data) return null;
        if (!isset($data['vectors']) || !is_array($data['vectors'])) return null;

        // only one ID was requested, so the first returned vector is the one we want
        $vector = array_shift($data['vectors']);
        if (!$vector) return null;

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception $e) {
            // delete all seems not supported -> starter edition
            // deliberately best effort: existing vectors will be overwritten instead of reused
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Deletes every chunk ID the page could possibly use, see possibleChunkIDs()
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery('/vectors/delete', ['ids' => $this->possibleChunkIDs($firstChunkID)]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /**
     * Re-enables the reuse of existing chunks
     *
     * @inheritdoc
     */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the ids parameter is repeated once per ID: ids=1&ids=2&...
        $query = 'ids=' . implode('&ids=', $this->possibleChunkIDs($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data) return [];
        if (!isset($data['vectors']) || !is_array($data['vectors'])) return [];

        $chunks = [];
        foreach ($data['vectors'] as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $limit = $limit * 2; // we can't check ACLs, so we return more than requested

        $response = $this->runQuery(
            '/query',
            [
                'vector' => $vector,
                'topK' => (int)$limit,
                'include_metadata' => true,
                'include_values' => true,
            ]
        );
        if (!isset($response['matches']) || !is_array($response['matches'])) return [];

        $chunks = [];
        foreach ($response['matches'] as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /**
     * Reports the vector count and fullness as returned by the index statistics endpoint.
     * Values missing from the response are reported as 0.
     *
     * @inheritdoc
     */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => isset($data['totalVectorCount']) ? $data['totalVectorCount'] : 0,
            'fullness' => isset($data['indexFullness']) ? $data['indexFullness'] : 0,
        ];
    }
}