xref: /plugin/aichat/Storage/PineconeStorage.php (revision edf69908aba1160784aaa80b226c29da73ea4978)
1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
/**
 * Implements the storage backend using a Pinecone index
 *
 * Chunks are stored as Pinecone vectors: the chunk ID is used as the vector ID, the embedding as the
 * vector values and page, text, language and creation time are kept in the vector's metadata.
 */
class PineconeStorage extends AbstractStorage
{
    /**
     * @var int Number of consecutive chunk IDs (starting at a page's first chunk ID) that are
     *          probed when fetching or deleting the chunks of a page
     */
    protected const MAX_CHUNKS_PER_PAGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the index URL and API key from the aichat helper configuration and prepares
     * a HTTP client that sends the API key and JSON headers with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // endpoints always start with a slash, so avoid a double slash in the final URL
        $this->baseurl = rtrim((string)$helper->getConf('pinecone_baseurl'), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response
     * @throws \Exception when encoding fails, no/invalid JSON is returned or the API reports an error.
     *                    For API errors the exception code is the error code reported by the API.
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // send an empty JSON object instead of the empty list json_encode() would produce
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // false, null and '' can never be decoded, report them together with the client's error
        if ((string)$response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }

        if (isset($result['message'])) {
            // exception codes have to be integers
            throw new \Exception('Pinecone API returned error. ' . $result['message'], (int)($result['code'] ?? 0));
        }

        return $result;
    }

    /**
     * Build the query string to address one or more vectors by their ID
     *
     * The parameter is repeated for each ID (ids=1&ids=2&...).
     *
     * @param array $ids the vector IDs
     * @return string the URL encoded query string without leading ? or &
     */
    protected function buildIdQuery(array $ids)
    {
        return implode('&', array_map(
            static fn($id) => 'ids=' . rawurlencode((string)$id),
            $ids
        ));
    }

    /**
     * Get all chunk IDs that might belong to the page starting at the given chunk ID
     *
     * @param int $firstChunkID
     * @return int[]
     */
    protected function getPossibleChunkIds($firstChunkID)
    {
        return range($firstChunkID, $firstChunkID + self::MAX_CHUNKS_PER_PAGE - 1, 1);
    }

    /**
     * Create a Chunk from a vector record as returned by the Pinecone API
     *
     * @param array $vector the vector record, including its metadata and values
     * @param int|string|null $chunkID the ID to use for the chunk, defaults to the ID in the record
     * @return Chunk
     */
    protected function vectorToChunk(array $vector, $chunkID = null)
    {
        $args = [
            $vector['metadata']['page'],
            $chunkID ?? $vector['id'],
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created'],
        ];

        // only query matches carry a score, otherwise the Chunk's own default is kept
        if (isset($vector['score'])) {
            $args[] = $vector['score'];
        }

        return new Chunk(...$args);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?' . $this->buildIdQuery([$chunkID]),
            '',
            'GET'
        );

        // a missing or empty vector list means the chunk is not stored
        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors) || $vectors === []) return null;

        $vector = reset($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs, the API wants them as strings
        $ids = array_map(
            static fn($id) => (string)$id,
            $this->getPossibleChunkIds($firstChunkID)
        );

        try {
            $this->runQuery('/vectors/delete', ['ids' => $ids]);
        } catch (\Exception $e) {
            // 5 is the code for "namespace not found" See #12
            if ($e->getCode() !== 5) throw $e;
        }
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                    // needed for the language filter in getSimilarChunks() and read back on fetch
                    // NOTE(review): relies on Chunk::getLanguage() - confirm it exists in Chunk
                    'language' => (string)$chunk->getLanguage(),
                ]
            ];
        }

        // nothing to store, don't bother the API with an empty upsert
        if ($vectors === []) return;

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }


    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        $data = $this->runQuery(
            '/vectors/fetch?' . $this->buildIdQuery($this->getPossibleChunkIds($firstChunkID)),
            '',
            'GET'
        );
        if (!$data) return [];

        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors)) return [];

        $chunks = [];
        foreach ($vectors as $vector) {
            $chunks[] = $this->vectorToChunk($vector);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'includeMetadata' => true,
            'includeValues' => true,
        ];

        if ($lang) {
            // operators are given as object: {"language": {"$eq": "en"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $matches = $response['matches'] ?? [];
        if (!is_array($matches)) return [];

        $chunks = [];
        foreach ($matches as $match) {
            $chunks[] = $this->vectorToChunk($match);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}
243