xref: /plugin/aichat/Storage/PineconeStorage.php (revision edf69908aba1160784aaa80b226c29da73ea4978)
113dbfc23SAndreas Gohr<?php
213dbfc23SAndreas Gohr
313dbfc23SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
413dbfc23SAndreas Gohr
513dbfc23SAndreas Gohruse dokuwiki\HTTP\DokuHTTPClient;
613dbfc23SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
713dbfc23SAndreas Gohr
/**
 * Implements the storage backend using a Pinecone index
 *
 * All communication with the index goes through runQuery(). Because Pinecone can not look up vectors by
 * their metadata, page based operations address a fixed window of consecutive chunk IDs that starts at
 * the first chunk ID of the page.
 */
class PineconeStorage extends AbstractStorage
{
    /** @var int number of consecutive chunk IDs that are addressed for a single page */
    protected const PAGE_CHUNK_WINDOW = 100;

    /** @var int error code for "namespace not found", see #12 */
    protected const ERROR_NAMESPACE_NOT_FOUND = 5;

    /** @var DokuHTTPClient preauthed client */
    protected $http;
    /** @var string full URL to the index instance */
    protected $baseurl;
    /** @var bool set to true when no chunks should be reused */
    protected $overwrite = false;

    /**
     * PineconeStorage constructor.
     *
     * Reads the base URL and API key from the aichat helper configuration and prepares a HTTP client
     * that sends the API key and JSON headers with every request.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // endpoints always start with a slash, so a trailing slash in the config would double it
        $this->baseurl = rtrim((string)$helper->getConf('pinecone_baseurl'), '/');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Api-Key'] = $helper->getConf('pinecone_apikey');
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
    }

    /**
     * Execute a query against the Pinecone API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed the decoded JSON response
     * @throws \Exception when encoding fails, no or invalid JSON is returned or the API reports an error.
     *                    For API errors the exception code is the numeric code sent by the API (0 if none)
     */
    protected function runQuery($endpoint, mixed $data, $method = 'POST')
    {
        $url = $this->baseurl . $endpoint;

        // an empty PHP array would be encoded as JSON list, send an empty object instead
        $json = ($data === []) ? '{}' : json_encode($data, JSON_THROW_ON_ERROR);

        $this->http->sendRequest($url, $json, $method);
        $response = $this->http->resp_body;
        // an empty body can never be valid JSON, report it together with the client's error message
        if ($response === false || $response === null || $response === '') {
            throw new \Exception('Pinecone API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode((string)$response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Pinecone API returned invalid JSON. ' . $response, 0, $e);
        }

        if (is_array($result) && isset($result['message'])) {
            // exception codes have to be integers, anything else would raise a TypeError
            $code = is_numeric($result['code'] ?? null) ? (int)$result['code'] : 0;
            throw new \Exception('Pinecone API returned error. ' . $result['message'], $code);
        }

        return $result;
    }

    /**
     * Safely read a list from a decoded API response
     *
     * @param mixed $response the decoded response as returned by runQuery()
     * @param string $key the key holding the list
     * @return array the list stored at the key, an empty array when it is missing or not an array
     */
    protected function getResponseList($response, $key)
    {
        if (!is_array($response)) return [];
        $list = $response[$key] ?? null;
        return is_array($list) ? $list : [];
    }

    /**
     * Create a Chunk from a vector record returned by the API
     *
     * @param array $vector the vector record with 'id', 'values' and 'metadata' (and 'score' for matches)
     * @param mixed $chunkID use this ID instead of the one in the record, null to use the record's ID
     * @return Chunk
     */
    protected function vectorToChunk(array $vector, $chunkID = null)
    {
        $args = [
            $vector['metadata']['page'],
            $chunkID ?? $vector['id'],
            $vector['metadata']['text'],
            $vector['values'],
            $vector['metadata']['language'] ?? '',
            $vector['metadata']['created'],
        ];

        // the score is only passed on when the record has one, otherwise Chunk's own default applies
        if (isset($vector['score'])) {
            $args[] = $vector['score'];
        }

        return new Chunk(...$args);
    }

    /**
     * Get all chunk IDs that may belong to the page starting at the given chunk ID
     *
     * @param int $firstChunkID the first chunk ID of the page
     * @return string[] the IDs as strings, which is how they are stored in the index
     */
    protected function getPageChunkIDs($firstChunkID)
    {
        $first = (int)$firstChunkID;
        $ids = range($first, $first + self::PAGE_CHUNK_WINDOW - 1, 1);
        return array_map(static fn($id) => (string)$id, $ids);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        if ($this->overwrite) return null; // no reuse allowed

        $data = $this->runQuery(
            '/vectors/fetch?ids=' . rawurlencode((string)$chunkID),
            '',
            'GET'
        );

        // the response may have no 'vectors' at all when the ID is unknown
        $vectors = $this->getResponseList($data, 'vectors');
        $vector = reset($vectors);
        if (!$vector) return null;

        return $this->vectorToChunk($vector, $chunkID);
    }

    /**
     * Proper clearing is not supported in the starter edition of pinecone. If clearing fails, we will simply
     * not reuse any existing vectors.
     *
     * @inheritdoc
     */
    public function startCreation($clear = false)
    {
        if (!$clear) return;

        try {
            $this->runQuery('/vectors/delete', ['delete_all' => 'True']);
        } catch (\Exception) {
            // delete all seems not supported -> starter edition
            $this->overwrite = true;
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Deletes every chunk ID in the page's ID window. A "namespace not found" error from the API
     * is ignored, all other errors are passed on.
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $ids = $this->getPageChunkIDs($firstChunkID);
        try {
            $this->runQuery('/vectors/delete', ['ids' => $ids]);
        } catch (\Exception $e) {
            if ($e->getCode() !== self::ERROR_NAMESPACE_NOT_FOUND) throw $e;
        }
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        // NOTE(review): no 'language' metadata is written here, although the read methods and the
        // language filter in getSimilarChunks() rely on it - confirm if the chunk's language should
        // be stored as well
        $vectors = [];
        foreach ($chunks as $chunk) {
            $vectors[] = [
                'id' => (string)$chunk->getId(),
                'values' => $chunk->getEmbedding(),
                'metadata' => [
                    'page' => $chunk->getPage(),
                    'created' => $chunk->getCreated(),
                    'text' => $chunk->getText(),
                ],
            ];
        }

        // nothing to store, do not bother the API with an empty upsert
        if ($vectors === []) return;

        $this->runQuery('/vectors/upsert', ['vectors' => $vectors]);
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->overwrite = false;
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /**
     * Pinecone can't query based on metadata, so we have to get all possible chunks by ID
     *
     * @link https://community.pinecone.io/t/fetch-vectors-based-only-on-metadata-filters/2140
     * @inheritdoc
     */
    public function getPageChunks($page, $firstChunkID)
    {
        // the fetch endpoint takes the IDs as repeated 'ids' parameters
        $query = 'ids=' . implode('&ids=', $this->getPageChunkIDs($firstChunkID));

        $data = $this->runQuery(
            '/vectors/fetch?' . $query,
            '',
            'GET'
        );

        $chunks = [];
        foreach ($this->getResponseList($data, 'vectors') as $vector) {
            $chunks[] = $this->vectorToChunk($vector);
        }
        return $chunks;
    }

    /**
     * Twice the requested number of chunks is queried, because ACLs can not be checked here.
     *
     * @inheritdoc
     */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        $query = [
            'vector' => $vector,
            'topK' => (int)$limit,
            'includeMetadata' => true,
            'includeValues' => true,
        ];

        if ($lang) {
            // filter operators are object keys: {"language": {"$eq": "en"}}
            $query['filter'] = ['language' => ['$eq' => $lang]];
        }

        $response = $this->runQuery('/query', $query);

        $chunks = [];
        foreach ($this->getResponseList($response, 'matches') as $match) {
            $chunks[] = $this->vectorToChunk($match);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $data = $this->runQuery('/describe_index_stats', []);

        return [
            'storage type' => 'Pinecone',
            'chunks' => $data['totalVectorCount'] ?? 0,
            'fullness' => $data['indexFullness'] ?? 0,
        ];
    }
}
243