<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which sends JSON to the configured
 * index URL and turns transport, decoding and API level errors into exceptions.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /** @inheritdoc */
    public function __construct(array $config)
    {
        $this->baseurl = $config['pinecone_baseurl'] ?? '';

        $this->http = new DokuHTTPClient();
        // fall back to an empty key instead of raising an "undefined index" warning
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return array the decoded JSON response
     * @throws \Exception when no response was received, the response is not valid JSON
     *                    or the response carries an error message
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // json_encode() would turn an empty PHP array into "[]", send an empty object instead
        if (is_array($data) && $data === []) {
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        // JSON_THROW_ON_ERROR makes json_decode() throw instead of returning null, so the
        // decoding error has to be caught to report it with the raw response attached
        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }
        // all callers index into the result, so anything but an array (null, scalar) is unusable
        if (!is_array($result)) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        if (!$data) return null;

        // the 'vectors' key may be missing when nothing was found
        $vectors = $data['vectors'] ?? [];
        $vector = array_shift($vectors);
        if (!$vector) return null;

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            try {
                $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
            } catch (\Exception) {
                // delete all seems not supported -> starter edition
                $this->overwrite = true;
            }
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs (100 consecutive IDs starting at the first one)
        $ids = range($firstChunkID, $firstChunkID + 99, 1);
        $ids = array_map(static fn($id) => (string)$id, $ids);
        $this->runQuery('/vectors/delete', ['ids' => $ids]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                // NOTE(review): no 'language' key is written here although the read paths look
                // for metadata.language and getSimilarChunks() filters on it - confirm whether
                // the chunk's language should be stored as well
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        // nothing to store, do not send a pointless request
        if ($vectors === []) return;

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // build "ids=1&ids=2&..." for the 100 consecutive IDs a page may use
        $ids = range($firstChunkID, $firstChunkID + 99, 1);
        $query = implode('&', array_map(static fn($id) => 'ids=' . $id, $ids));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data) return [];

        $chunks = [];
        foreach (($data['vectors'] ?? []) as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['language'] ?? '',
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];

        // Only restrict by language when one was given. The operator has to be the key of
        // the condition ({"language": {"$eq": "en"}}); an empty filter is left out completely
        // because an empty PHP array would be encoded as a JSON list.
        if ($lang) {
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['language'] ?? '',
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'],
            'fullness' => $data['indexFullness'],
        ];
    }
}