<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which sends JSON requests with a
 * pre-authenticated HTTP client and converts API level errors into exceptions.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * Set up the base URL and an HTTP client carrying the API key and JSON headers
     *
     * Reads the keys 'pinecone_baseurl' and 'pinecone_apikey' from the given config,
     * both default to an empty string when missing.
     *
     * @inheritdoc
     */
    public function __construct(array $config)
    {
        $this->baseurl = $config['pinecone_baseurl'] ?? '';

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * Error codes used by the thrown exceptions:
     *  - 4001: the HTTP client returned no response body
     *  - 4003: the response was not valid JSON
     *  - 4002: the API reported an error without a usable numeric code
     *  - any other: the numeric 'code' reported by the API alongside its 'message'
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response
     * @throws \Exception
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        if (is_array($data) && $data === []) {
            // json_encode([]) would yield "[]", send an empty JSON object instead
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false) {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error, 4001);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 4003, $e);
        }

        if (isset($result['message'])) {
            // the code may be missing or non-numeric; \Exception only accepts integers
            $code = $result['code'] ?? 0;
            $code = is_numeric($code) ? (int)$code : 0;
            throw new \Exception('Pinecone API returned error. ' . $result['message'], $code ?: 4002);
        }

        return $result;
    }

    /**
     * Fetch a single chunk by its ID
     *
     * Returns null while in overwrite mode, when the API returns an empty result
     * or when the result contains no vector.
     *
     * @inheritdoc
     */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        if (!$data) return null;

        // the 'vectors' key may be absent, array_shift() needs a real array
        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors)) return null;
        $vector = array_shift($vectors);
        if (!$vector) return null;

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            try {
                // NOTE(review): payload kept as is; confirm the API accepts this key and the string 'True'
                $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
            } catch (\Exception) {
                // delete all seems not supported -> starter edition
                $this->overwrite = true;
            }
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Delete the 100 consecutive chunk IDs starting at the given first chunk ID
     *
     * An API error with code 5 is ignored, all other errors are rethrown.
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $ids = range($firstChunkID, $firstChunkID + 99, 1);
        $ids = array_map(static fn($id) => (string)$id, $ids);
        try {
            $this->runQuery('/vectors/delete', ['ids' => $ids]);
        } catch (\Exception $e) {
            // 5 is the code for "namespace not found" See #12
            if ($e->getCode() !== 5) throw $e;
        }
    }

    /**
     * Upsert the given chunks as vectors, storing page, creation time and text as metadata
     *
     * Nothing is sent when there are no chunks.
     *
     * @inheritdoc
     */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                // NOTE(review): 'language' is read back from the metadata and used as a query filter
                // elsewhere in this class but is never written here - confirm whether it should be stored
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        // nothing to store, skip the request
        if ($vectors === []) return;

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /**
     * Leave overwrite mode again
     *
     * @inheritdoc
     */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * The 100 consecutive IDs starting at the given first chunk ID are requested.
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + 99, 1);
        // builds ids=1&ids=2&... without a leading ampersand
        $query = 'ids=' . implode('&ids=', $ids);

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data) return [];

        $chunks = [];
        foreach (($data['vectors'] ?? []) as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['language'] ?? '',
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /**
     * Query the index for the chunks most similar to the given vector
     *
     * Twice the requested limit is asked for. When a language is given, results are
     * restricted to vectors whose 'language' metadata equals it.
     *
     * @inheritdoc
     */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'includeMetadata' => true,
            'includeValues' => true,
        ];

        if ($lang) {
            // the operator has to be the key, so this encodes as {"language": {"$eq": "..."}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);
        $chunks = [];
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['language'] ?? '',
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /**
     * Report the vector count and fullness of the index
     *
     * Values missing from the API response are reported as 0.
     *
     * @inheritdoc
     */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}