<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which sends JSON to the configured
 * index URL and turns transport, decoding and API level failures into exceptions.
 */
class PineconeStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs, starting at a page's first chunk ID, that are
     * treated as possibly belonging to that page when fetching or deleting by ID.
     */
    protected const CHUNK_ID_RANGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the index URL and API key from the aichat helper plugin configuration and
     * prepares an HTTP client that sends the API key and JSON headers on every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        $this->baseurl = $helper->getConf('pinecone_baseurl');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when there is no response, the response is not valid JSON,
     *                    or the decoded response carries a 'message' (API error)
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        if (is_array($data) && $data === []) {
            // an empty PHP array would encode to "[]", but an empty JSON object is wanted
            $json = '{}';
        } else {
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false) {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }

        if (isset($result['message'])) {
            // the exception code must be an integer, anything else would raise a TypeError
            throw new \Exception(
                'Pinecone API returned error. ' . $result['message'],
                (int)($result['code'] ?? 0)
            );
        }

        return $result;
    }

    /**
     * List all chunk IDs that may belong to the page starting at the given ID
     *
     * @param int $firstChunkID
     * @return string[] the IDs as strings
     */
    protected function possibleChunkIDs($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::CHUNK_ID_RANGE - 1, 1);
        return array_map(static fn($id) => (string)$id, $ids);
    }

    /**
     * Fetches a single vector by ID. Returns null while reuse is disabled or when
     * the API response contains no vector.
     *
     * @inheritdoc
     */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) {
            return null; // no reuse allowed
        }

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        if (!$data) {
            return null;
        }

        // the 'vectors' key may be missing from the response
        $vectors = $data['vectors'] ?? [];
        $vector = array_shift($vectors);
        if (!$vector) {
            return null;
        }

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            try {
                $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
            } catch (\Exception) {
                // delete all seems not supported -> starter edition
                $this->overwrite = true;
            }
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Deletes by ID because the page itself is not used for selecting vectors here.
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $ids = $this->possibleChunkIDs($firstChunkID);
        try {
            $this->runQuery('/vectors/delete', ['ids' => $ids]);
        } catch (\Exception $e) {
            // 5 is the code for "namespace not found" See #12
            if ($e->getCode() !== 5) {
                throw $e;
            }
        }
    }

    /**
     * Upserts one vector per chunk, keyed by the chunk ID.
     *
     * NOTE(review): no 'language' metadata is written here, although getChunk(),
     * getPageChunks() and the language filter in getSimilarChunks() read it.
     * Confirm how Chunk exposes its language and store it alongside the other metadata.
     *
     * @inheritdoc
     */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // creation is done, existing chunks may be reused again
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // builds ids=1&ids=2&... without a stray leading ampersand
        $ids = 'ids=' . implode('&ids=', $this->possibleChunkIDs($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $ids,
            '',
            'GET'
        );
        if (!$data) {
            return [];
        }

        $chunks = [];
        // the 'vectors' key may be missing from the response
        foreach (($data['vectors'] ?? []) as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['language'] ?? '',
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /**
     * Queries for the vectors closest to the given one, optionally restricted to a language.
     *
     * @inheritdoc
     */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'includeMetadata' => true,
            'includeValues' => true,
        ];

        if ($lang) {
            // the operator has to be the key, so this encodes to {"language": {"$eq": "xx"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);
        $chunks = [];
        // the 'matches' key may be missing from the response
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['language'] ?? '',
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        // an empty array is sent as an empty JSON object, see runQuery()
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'],
            'fullness' => $data['indexFullness'],
        ];
    }
}