<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\plugin\aichat\Chunk;

/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication goes through runQuery(), which talks JSON to the index
 * instance configured in the aichat helper plugin.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the base URL and API key from the aichat helper plugin configuration
     * and prepares an HTTP client that sends the API key with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        $this->baseurl = $helper->getConf('pinecone_baseurl');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when the payload can not be encoded, no response was received,
     *                    the response is not valid JSON or the API reported an error message
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // json_encode() would turn an empty PHP array into "[]"; send an empty object instead
        if (is_array($data) && !count($data)) {
            $json = '{}';
        } else {
            $json = json_encode($data);
            if ($json === false) {
                // do not send a broken body, report the encoding problem instead
                throw new \Exception('Pinecone API request could not be encoded. ' . json_last_error_msg());
            }
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // an empty body can never be decoded, so report it together with the transport error
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode($response, true);
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        // a "message" field in the response is treated as an error report
        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of all chunk IDs that may belong to a page
     *
     * A page's chunks are looked up by probing the 100 consecutive IDs starting at the first chunk ID.
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return int[] the 100 consecutive IDs starting at $firstChunkID
     */
    protected function getPossibleChunkIDs($firstChunkID)
    {
        return range($firstChunkID, $firstChunkID + 99, 1);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . $chunkID,
            '',
            'GET'
        );
        // a response without a usable vector list means the chunk does not exist
        if (!$data || !isset($data['vectors']) || !is_array($data['vectors'])) return null;
        $vector = array_shift($data['vectors']);
        if (!$vector) return null;

        return new Chunk(
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['created']
        );
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception $e) {
            // delete all seems not supported -> starter edition
            // deliberate best effort: getChunk() will report no reusable chunks until finalizeCreation()
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs, the IDs are sent as strings
        $ids = array_map(function ($id) {
            return (string)$id;
        }, $this->getPossibleChunkIDs($firstChunkID));

        $this->runQuery('/vectors/delete', ['ids' => $ids]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // allow chunk reuse again after a run that could not clear the index
        $this->overwrite = false;
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // build "ids=1&ids=2&..." without a stray leading ampersand
        $query = implode('&', array_map(function ($id) {
            return 'ids=' . $id;
        }, $this->getPossibleChunkIDs($firstChunkID)));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data || !isset($data['vectors']) || !is_array($data['vectors'])) return [];

        $chunks = [];
        foreach ($data['vectors'] as $vector) {
            $chunks[] = new Chunk(
                $vector['metadata']['page'],
                $vector['id'],
                $vector['metadata']['text'],
                $vector['values'],
                $vector['metadata']['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $limit = $limit * 2; // we can't check ACLs, so we return more than requested

        $response = $this->runQuery(
            '/query',
            [
                'vector' => $vector,
                'topK' => (int)$limit,
                'include_metadata' => true,
                'include_values' => true,
            ]
        );

        // a response without a match list is treated as "nothing found"
        $matches = (isset($response['matches']) && is_array($response['matches'])) ? $response['matches'] : [];

        $chunks = [];
        foreach ($matches as $match) {
            $chunks[] = new Chunk(
                $match['metadata']['page'],
                $match['id'],
                $match['metadata']['text'],
                $match['values'],
                $match['metadata']['created'],
                $match['score']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            // missing fields are reported as null instead of raising a warning
            'chunks' => isset($data['totalVectorCount']) ? $data['totalVectorCount'] : null,
            'fullness' => isset($data['indexFullness']) ? $data['indexFullness'] : null,
        ];
    }
}