<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication happens through JSON requests against the configured index URL.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the base URL and the API key from the aichat helper plugin configuration and
     * prepares an HTTP client that sends the API key and JSON headers with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        $this->baseurl = $helper->getConf('pinecone_baseurl');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the JSON decoded response
     * @throws \Exception when the data can not be encoded, no response was received, the response
     *                    is not valid JSON or the response carries an error message
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        if (is_array($data) && !count($data)) {
            // json_encode() would turn an empty PHP array into "[]", send an empty object instead
            $json = '{}';
        } else {
            $json = json_encode($data);
            if ($json === false) {
                // e.g. invalid UTF-8 in the payload; never send a broken body to the API
                throw new \Exception('Pinecone request could not be JSON encoded. ' . json_last_error_msg());
            }
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            // an empty body can not be decoded anyway, so report the transport error instead
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode($response, true);
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of all chunk IDs that are looked at for a single page
     *
     * The storage is only addressed by ID, so page level operations work on the fixed range of
     * 100 consecutive IDs starting at the page's first chunk ID.
     *
     * @param int $firstChunkID
     * @return int[]
     */
    private function possibleChunkIDs($firstChunkID)
    {
        return range($firstChunkID, $firstChunkID + 99, 1);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) {
            return null; // no reuse allowed
        }

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        if (!$data || empty($data['vectors'])) {
            return null; // nothing stored under this ID
        }
        $vector = array_shift($data['vectors']);
        if (!$vector) {
            return null;
        }

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) {
            return;
        }

        try {
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception $e) {
            // delete all seems not supported -> starter edition
            // deliberate best effort: getChunk() will stop returning stored chunks from now on
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs, they are sent as strings
        $ids = array_map('strval', $this->possibleChunkIDs($firstChunkID));
        $this->runQuery('/vectors/delete', ['ids' => $ids]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // creation is done, stored chunks may be reused again
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // one ids=<id> parameter per possible chunk ID
        $query = implode('&', array_map(
            static function ($id) {
                return 'ids=' . $id;
            },
            $this->possibleChunkIDs($firstChunkID)
        ));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data || empty($data['vectors'])) {
            return [];
        }

        $chunks = [];
        foreach ($data['vectors'] as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $limit = $limit * 2; // we can't check ACLs, so we return more than requested

        $response = $this->runQuery(
            '/query',
            [
                'vector' => $vector,
                'topK' => (int)$limit,
                'include_metadata' => true,
                'include_values' => true,
            ]
        );
        if (empty($response['matches'])) {
            return [];
        }

        $chunks = [];
        foreach ($response['matches'] as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        // guard the lookups so a field missing from the response does not raise a warning
        return [
            'storage type' => 'Pinecone',
            'chunks' => isset($data['totalVectorCount']) ? $data['totalVectorCount'] : null,
            'fullness' => isset($data['indexFullness']) ? $data['indexFullness'] : null,
        ];
    }
}