xref: /plugin/aichat/Storage/PineconeStorage.php (revision 7ebc78955c65af90e7ee0afbd07adc15271113ba)
1<?php
2
3namespace dokuwiki\plugin\aichat\Storage;
4
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\plugin\aichat\Chunk;
7
/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication happens through JSON requests sent with a pre-authenticated HTTP client.
 * Chunks of a page are addressed through a fixed range of consecutive IDs starting at the
 * page's first chunk ID, because vectors are fetched and deleted by ID only.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var int number of consecutive chunk IDs that are probed for a single page */
    const MAX_CHUNKS_PER_PAGE = 100;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance (without trailing slash) */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the base URL and API key from the aichat helper plugin configuration and prepares
     * the HTTP client with the headers needed for every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // endpoints always start with a slash, so avoid a double slash in the final URL
        $this->baseurl = rtrim((string)$helper->getConf('pinecone_baseurl'), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when the data can't be encoded, no or invalid JSON is returned or the
     *                    response contains a 'message' field
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        if (is_array($data) && $data === []) {
            // an empty PHP array would be encoded as JSON list, send an empty object instead
            $json = '{}';
        } else {
            $json = json_encode($data);
            if ($json === false) {
                // never send a broken payload, e.g. when the text contains invalid UTF-8
                throw new \Exception('Pinecone API request could not be encoded. ' . json_last_error_msg());
            }
        }

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        if ($response === false || $response === null || $response === '') {
            // an empty body can never be valid JSON, report the transport error instead
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        $result = json_decode($response, true);
        if ($result === null) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response);
        }

        if (isset($result['message'])) {
            throw new \Exception('Pinecone API returned error. ' . $result['message']);
        }

        return $result;
    }

    /**
     * Build the list of all chunk IDs a page may occupy
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return string[] consecutive IDs as strings, starting at the first chunk ID
     */
    protected function getPossibleChunkIDs($firstChunkID)
    {
        $ids = range($firstChunkID, $firstChunkID + self::MAX_CHUNKS_PER_PAGE - 1, 1);
        return array_map(function ($id) {
            return (string)$id;
        }, $ids);
    }

    /**
     * Create a Chunk from a vector structure as returned by the API
     *
     * @param array $vector the vector data with 'values' and 'metadata' keys
     * @param string|int $id the chunk ID to assign
     * @param float|null $score similarity score, only passed on to the Chunk when given
     * @return Chunk
     */
    protected function vectorToChunk($vector, $id, $score = null)
    {
        $metadata = $vector['metadata'];
        $args = [
            $metadata['page'],
            $id,
            $metadata['text'],
            $vector['values'],
            $metadata['language'] ?? '', // may be missing on vectors without stored language
            $metadata['created'],
        ];
        if ($score !== null) $args[] = $score;

        return new Chunk(...$args);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );
        if (!$data) return null;

        // the API may answer without a 'vectors' key when nothing was found
        $vectors = $data['vectors'] ?? [];
        if (!is_array($vectors) || !$vectors) return null;

        $vector = reset($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            try {
                // the flag has to be a real JSON boolean, not the string 'True'
                $this->runQuery('/vectors/delete', ['delete_all' => true]);
            } catch (\Exception $e) {
                // delete all seems not supported -> starter edition
                $this->overwrite = true;
            }
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery('/vectors/delete', ['ids' => $this->getPossibleChunkIDs($firstChunkID)]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $vectors = [];
        foreach ($chunks as $chunk) {
            $metadata = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'text' => $chunk->getText(),
            ];
            // The language is read back and used as query filter, so it has to be stored as well.
            // NOTE(review): getLanguage() is presumed to be the accessor matching the language
            // constructor argument of Chunk - confirm and drop the guard.
            if (method_exists($chunk, 'getLanguage')) {
                $metadata['language'] = (string)$chunk->getLanguage();
            }

            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => $metadata,
            ];
        }

        if (!$vectors) return; // nothing to store, avoid a pointless request

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }


    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the ID parameter is simply repeated for each ID to fetch
        $query = 'ids=' . implode('&ids=', $this->getPossibleChunkIDs($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );
        if (!$data) return [];

        $chunks = [];
        foreach (($data['vectors'] ?? []) as $vector) {
            $chunks[] = $this->vectorToChunk($vector, $vector['id']);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'include_metadata' => true,
            'include_values' => true,
        ];
        if ($lang) {
            // operators are object keys: {"language": {"$eq": "en"}}
            // without a language no filter is sent at all
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach (($response['matches'] ?? []) as $match) {
            $chunks[] = $this->vectorToChunk($match, $match['id'], $match['score']);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            // fall back to zero when the API leaves a field out of the response
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}
245