1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
/**
 * Implements the storage backend using a Pinecone index
 *
 * Each chunk is stored as one Pinecone vector: the chunk ID is used as the vector ID, the embedding as the
 * vector values, and page, text and creation time are kept in the vector's metadata.
 */
class PineconeStorage extends AbstractStorage
{
    /**
     * @var int number of consecutive chunk IDs (starting at a page's first chunk ID) that are probed when
     *          fetching or deleting the chunks of a page
     */
    protected const PAGE_CHUNK_RANGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /** @inheritdoc */
    public function __construct(array $config)
    {
        // endpoints always start with a slash, so a trailing slash here would result in "//endpoint"
        $this->baseurl = rtrim((string)($config['pinecone_baseurl'] ?? ''), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $config['pinecone_apikey'] ?? '';
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when there is no response, the response is not valid JSON or the API reports an error
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // an empty PHP array would be encoded as the JSON list "[]", so an empty object is sent explicitly
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // an empty body can never be valid JSON; report it together with the client's error message
        if ((string)$response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }

        if (is_array($result) && isset($result['message'])) {
            // the exception code has to be an integer, callers compare it strictly
            throw new \Exception(
                'Pinecone API returned error. ' . $result['message'],
                (int)($result['code'] ?? 0)
            );
        }

        return $result;
    }

    /**
     * Build the list of all vector IDs that may belong to a page
     *
     * @param int $firstChunkID the ID of the page's first chunk
     * @return string[] the candidate vector IDs
     */
    protected function pageChunkIds($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::PAGE_CHUNK_RANGE - 1, 1);
        return array_map(static fn($id) => (string)$id, $ids);
    }

    /**
     * Create a Chunk from a vector as returned by the fetch and query endpoints
     *
     * @param array $vector the vector data, including its metadata and values
     * @param int|string $chunkID the ID to assign to the chunk
     * @return Chunk
     */
    protected function vectorToChunk(array $vector, $chunkID)
    {
        $args = [
            $vector['metadata']['page'],
            $chunkID,
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created'],
        ];
        // only query matches carry a score; otherwise the Chunk's own default is used
        if (isset($vector['score'])) {
            $args[] = $vector['score'];
        }

        return new Chunk(...$args);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );
        if (!is_array($data)) return null;

        // the result is keyed by vector ID, we asked for a single one
        $vectors = $data['vectors'] ?? [];
        $vector = array_shift($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            try {
                // NOTE(review): the REST API may expect a boolean "deleteAll" field here - confirm against
                // the Pinecone API reference. A failing request silently switches to overwrite mode.
                $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
            } catch (\Exception) {
                // delete all seems not supported -> starter edition
                $this->overwrite = true;
            }
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        try {
            $this->runQuery('/vectors/delete', ['ids' => $this->pageChunkIds($firstChunkID)]);
        } catch (\Exception $e) {
            // 5 is the code for "namespace not found" See #12
            if ($e->getCode() !== 5) throw $e;
        }
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            // NOTE(review): no 'language' is written to the metadata although it is read back when chunks
            // are loaded and used as filter in getSimilarChunks() - presumably it should be stored here, too
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ]
            ];
        }

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // creation is done, existing chunks may be reused again
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }


    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the fetch endpoint expects the "ids" parameter to be repeated for each ID
        $query = 'ids=' . implode('&ids=', $this->pageChunkIds($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!is_array($data)) return [];

        $chunks = [];
        foreach ($data['vectors'] ?? [] as $vector) {
            $chunks[] = $this->vectorToChunk($vector, $vector['id']);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'includeMetadata' => true,
            'includeValues' => true,
        ];

        if ($lang) {
            // operators are given as key of an object: {"language": {"$eq": "en"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);
        if (!is_array($response)) return [];

        $chunks = [];
        foreach ($response['matches'] ?? [] as $match) {
            $chunks[] = $this->vectorToChunk($match, $match['id']);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'],
            'fullness' => $data['indexFullness'],
        ];
    }
}
239