xref: /plugin/aichat/Storage/PineconeStorage.php (revision 04afb84f6cb8a0c9b1d4d807e18f90fe739ec371)
1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
8/**
9 * Implements the storage backend using a Pinecone index
10 */
class PineconeStorage extends AbstractStorage
{
    /**
     * Number of consecutive chunk IDs, counted from a page's first chunk ID, that are
     * probed when fetching or deleting the chunks of a page.
     */
    protected const CHUNKS_PER_PAGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance (without trailing slash) */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * Set up the base URL and an HTTP client that authenticates against the Pinecone API
     *
     * @param array $config plugin configuration, uses the keys pinecone_baseurl and pinecone_apikey
     */
    public function __construct(array $config)
    {
        // endpoints always start with a slash, so avoid doubling it
        $this->baseurl = rtrim((string)($config['pinecone_baseurl'] ?? ''), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when the request yields no body, the body is not valid JSON
     *                    or the API reports an error message
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // an empty PHP array would be encoded as JSON list "[]", the API expects an object
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            // keep the raw body in the message, the bare "Syntax error" is useless for debugging
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of all chunk IDs that may belong to a page
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return string[] the candidate IDs as strings
     */
    protected function pageChunkIDs($firstChunkID)
    {
        return array_map(
            static fn($id) => (string)$id,
            range($firstChunkID, $firstChunkID + self::CHUNKS_PER_PAGE - 1, 1)
        );
    }

    /**
     * Convert a vector record as returned by the API into a Chunk
     *
     * @param array $vector a single entry from a fetch or query response
     * @param int|string|null $chunkID ID to use for the chunk, defaults to the ID in the record
     * @return Chunk
     */
    protected function vectorToChunk(array $vector, $chunkID = null)
    {
        $id = $chunkID ?? $vector['id'];
        $language = $vector['metadata']['language'] ?? '';

        // only query results carry a score; fetch results rely on the Chunk default
        if (!isset($vector['score'])) {
            return new Chunk(
                $vector['metadata']['page'],
                $id,
                $vector['metadata']['text'],
                $vector['values'],
                $language,
                $vector['metadata']['created']
            );
        }

        return new Chunk(
            $vector['metadata']['page'],
            $id,
            $vector['metadata']['text'],
            $vector['values'],
            $language,
            $vector['metadata']['created'],
            $vector['score']
        );
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );
        if (!$data) return null;

        // the response may come without any vectors when the ID is unknown
        $vectors = $data['vectors'] ?? [];
        $vector = array_shift($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            // NOTE(review): the flag is sent as the string 'True', not a boolean - confirm the API accepts this,
            // otherwise this always ends up in the fallback below
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery('/vectors/delete', ['ids' => $this->pageChunkIDs($firstChunkID)]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                // NOTE(review): no 'language' is stored here although the readers and the query filter
                // look for it in the metadata - confirm whether the chunk language should be added
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        $query = 'ids=' . implode('&ids=', $this->pageChunkIDs($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data) return [];

        $chunks = [];
        foreach (($data['vectors'] ?? []) as $vector) {
            $chunks[] = $this->vectorToChunk($vector);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];
        if ($lang) {
            // metadata filters are operator maps: {"language": {"$eq": "en"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = $this->vectorToChunk($match);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        // missing counters are reported as 0 instead of triggering undefined index warnings
        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}
237