xref: /plugin/aichat/Storage/PineconeStorage.php (revision 13dbfc23174f3d6b4ae0540553675c666472eae2)
1*13dbfc23SAndreas Gohr<?php
2*13dbfc23SAndreas Gohr
3*13dbfc23SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
4*13dbfc23SAndreas Gohr
5*13dbfc23SAndreas Gohruse dokuwiki\HTTP\DokuHTTPClient;
6*13dbfc23SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
7*13dbfc23SAndreas Gohr
/**
 * Implements the storage backend using a Pinecone index
 *
 * Chunks are stored as vectors: the chunk ID is used as the vector ID, the embedding as the
 * vector values, and page, creation time and text are kept in the vector's metadata.
 * All communication happens through JSON requests sent via a DokuHTTPClient.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var int number of consecutive chunk IDs that are probed when looking up a page's chunks */
    const MAX_PAGE_CHUNKS = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the base URL and API key from the aichat helper plugin's configuration and
     * prepares an HTTP client that sends the API key and JSON headers with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // endpoints always start with a slash, so avoid a double slash when the
        // configured URL already ends in one
        $this->baseurl = rtrim((string)$helper->getConf('pinecone_baseurl'), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response (objects are returned as associative arrays)
     * @throws \Exception when the data can not be encoded, when no response was received, when the
     *                    response is not valid JSON or when the response carries an error message
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // json_encode() would serialize an empty PHP array as "[]", send an empty JSON object instead
        if (is_array($data) && !count($data)) {
            $json = '{}';
        } else {
            $json = json_encode($data);
            if ($json === false) {
                throw new \Exception('Failed to encode data for Pinecone API. ' . json_last_error_msg());
            }
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // an empty body can never be valid JSON, report it together with the client's error message
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode($response, true);
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        // a "message" key in the response is treated as an error report
        if (is_array($result) && isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of vector IDs that may belong to a single page
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return string[] MAX_PAGE_CHUNKS consecutive IDs starting at $firstChunkID
     */
    protected function getPageChunkIDs($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::MAX_PAGE_CHUNKS - 1, 1);
        return array_map('strval', $ids);
    }

    /**
     * Create a Chunk object from a vector as returned by the Pinecone API
     *
     * The score is only passed on when the vector has one (as is the case for query matches).
     *
     * @param array $vector a single vector with 'metadata' and 'values' keys and an optional 'score'
     * @param int|string $id the ID to assign to the chunk
     * @return Chunk
     */
    protected function vectorToChunk($vector, $id)
    {
        $meta = $vector['metadata'];

        if (array_key_exists('score', $vector)) {
            return new Chunk(
                $meta['page'],
                $id,
                $meta['text'],
                $vector['values'],
                $meta['created'],
                $vector['score']
            );
        }

        return new Chunk(
            $meta['page'],
            $id,
            $meta['text'],
            $vector['values'],
            $meta['created']
        );
    }

    /**
     * Fetch a single chunk by its ID
     *
     * Always returns null while existing chunks must not be reused (see startCreation()).
     *
     * @inheritdoc
     * @throws \Exception when the API request fails
     */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );
        // no (usable) vector list in the response means there is no such chunk
        if (empty($data['vectors']) || !is_array($data['vectors'])) return null;
        $vector = array_shift($data['vectors']);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            try {
                // NOTE(review): the flag is sent as the string 'True' under a snake_case name, while
                // Pinecone's REST reference describes a boolean "deleteAll" - confirm this form is accepted
                $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
            } catch (\Exception $e) {
                // delete all seems not supported -> starter edition
                $this->overwrite = true;
            }
        }
    }

    /**
     * Nothing needs to be done to keep existing chunks
     *
     * @inheritdoc
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Deletes all vectors whose IDs fall into the ID range of the given page
     *
     * @inheritdoc
     * @throws \Exception when the API request fails
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery('/vectors/delete', ['ids' => $this->getPageChunkIDs($firstChunkID)]);
    }

    /**
     * Upserts all given chunks in a single request
     *
     * @inheritdoc
     * @throws \Exception when the API request fails
     */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /**
     * Ends the creation run, existing chunks may be reused again afterwards
     *
     * @inheritdoc
     */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     * @throws \Exception when the API request fails
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the IDs are passed as a repeated "ids" query parameter
        $query = 'ids=' . implode('&ids=', $this->getPageChunkIDs($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (empty($data['vectors']) || !is_array($data['vectors'])) return [];

        $chunks = [];
        foreach ($data['vectors'] as $vector) {
            $chunks[] = $this->vectorToChunk($vector, $vector['id']);
        }
        return $chunks;
    }

    /**
     * Queries the index for the vectors most similar to the given one
     *
     * @inheritdoc
     * @throws \Exception when the API request fails
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $limit = $limit * 2; // we can't check ACLs, so we return more than requested

        $response = $this->runQuery(
            '/query',
            [
                'vector' => $vector,
                'topK' => (int)$limit,
                'include_metadata' => true,
                'include_values' => true,
            ]
        );
        if (empty($response['matches']) || !is_array($response['matches'])) return [];

        $chunks = [];
        foreach ($response['matches'] as $match) {
            $chunks[] = $this->vectorToChunk($match, $match['id']);
        }
        return $chunks;
    }

    /**
     * Reports the vector count and fullness of the index as returned by the API
     *
     * @inheritdoc
     * @throws \Exception when the API request fails
     */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'],
            'fullness' => $data['indexFullness'],
        ];
    }
}
228