xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 35555bacbc10d1d920fb24cfc042d01a87f94b73)
1f6ef2e50SAndreas Gohr<?php
2f6ef2e50SAndreas Gohr
3f6ef2e50SAndreas Gohr
4f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
5f6ef2e50SAndreas Gohr
6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
7f6ef2e50SAndreas Gohruse dokuwiki\plugin\sqlite\SQLiteDB;
8f6ef2e50SAndreas Gohr
9f6ef2e50SAndreas Gohr/**
10f6ef2e50SAndreas Gohr * Implements the storage backend using a SQLite database
11*35555bacSAndreas Gohr *
12*35555bacSAndreas Gohr * Note: all embeddings are stored and returned as normalized vectors
13f6ef2e50SAndreas Gohr */
14f6ef2e50SAndreas Gohrclass SQLiteStorage extends AbstractStorage
15f6ef2e50SAndreas Gohr{
1681b450c8SAndreas Gohr    /** @var float minimum similarity to consider a chunk a match */
1781b450c8SAndreas Gohr    const SIMILARITY_THRESHOLD = 0.75;
1881b450c8SAndreas Gohr
19f6ef2e50SAndreas Gohr    /** @var SQLiteDB */
20f6ef2e50SAndreas Gohr    protected $db;
21f6ef2e50SAndreas Gohr
22f6ef2e50SAndreas Gohr    /**
23f6ef2e50SAndreas Gohr     * Initializes the database connection and registers our custom function
24f6ef2e50SAndreas Gohr     *
25f6ef2e50SAndreas Gohr     * @throws \Exception
26f6ef2e50SAndreas Gohr     */
27f6ef2e50SAndreas Gohr    public function __construct()
28f6ef2e50SAndreas Gohr    {
29f6ef2e50SAndreas Gohr        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
30f6ef2e50SAndreas Gohr        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
31f6ef2e50SAndreas Gohr    }
32f6ef2e50SAndreas Gohr
33f6ef2e50SAndreas Gohr    /** @inheritdoc */
34f6ef2e50SAndreas Gohr    public function getChunk($chunkID)
35f6ef2e50SAndreas Gohr    {
36f6ef2e50SAndreas Gohr        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
37f6ef2e50SAndreas Gohr        if (!$record) return null;
38f6ef2e50SAndreas Gohr
39f6ef2e50SAndreas Gohr        return new Chunk(
40f6ef2e50SAndreas Gohr            $record['page'],
41f6ef2e50SAndreas Gohr            $record['id'],
42f6ef2e50SAndreas Gohr            $record['chunk'],
43f6ef2e50SAndreas Gohr            json_decode($record['embedding'], true),
44f6ef2e50SAndreas Gohr            $record['created']
45f6ef2e50SAndreas Gohr        );
46f6ef2e50SAndreas Gohr    }
47f6ef2e50SAndreas Gohr
48f6ef2e50SAndreas Gohr    /** @inheritdoc */
49f6ef2e50SAndreas Gohr    public function startCreation($clear = false)
50f6ef2e50SAndreas Gohr    {
51f6ef2e50SAndreas Gohr        if ($clear) {
52f6ef2e50SAndreas Gohr            /** @noinspection SqlWithoutWhere */
53f6ef2e50SAndreas Gohr            $this->db->exec('DELETE FROM embeddings');
54f6ef2e50SAndreas Gohr        }
55f6ef2e50SAndreas Gohr    }
56f6ef2e50SAndreas Gohr
57f6ef2e50SAndreas Gohr    /** @inheritdoc */
58f6ef2e50SAndreas Gohr    public function reusePageChunks($page, $firstChunkID)
59f6ef2e50SAndreas Gohr    {
60f6ef2e50SAndreas Gohr        // no-op
61f6ef2e50SAndreas Gohr    }
62f6ef2e50SAndreas Gohr
63f6ef2e50SAndreas Gohr    /** @inheritdoc */
64f6ef2e50SAndreas Gohr    public function deletePageChunks($page, $firstChunkID)
65f6ef2e50SAndreas Gohr    {
66f6ef2e50SAndreas Gohr        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
67f6ef2e50SAndreas Gohr    }
68f6ef2e50SAndreas Gohr
69f6ef2e50SAndreas Gohr    /** @inheritdoc */
70f6ef2e50SAndreas Gohr    public function addPageChunks($chunks)
71f6ef2e50SAndreas Gohr    {
72f6ef2e50SAndreas Gohr        foreach ($chunks as $chunk) {
73f6ef2e50SAndreas Gohr            $this->db->saveRecord('embeddings', [
74f6ef2e50SAndreas Gohr                'page' => $chunk->getPage(),
75f6ef2e50SAndreas Gohr                'id' => $chunk->getId(),
76f6ef2e50SAndreas Gohr                'chunk' => $chunk->getText(),
77f6ef2e50SAndreas Gohr                'embedding' => json_encode($chunk->getEmbedding()),
78f6ef2e50SAndreas Gohr                'created' => $chunk->getCreated()
79f6ef2e50SAndreas Gohr            ]);
80f6ef2e50SAndreas Gohr        }
81f6ef2e50SAndreas Gohr    }
82f6ef2e50SAndreas Gohr
83f6ef2e50SAndreas Gohr    /** @inheritdoc */
84f6ef2e50SAndreas Gohr    public function finalizeCreation()
85f6ef2e50SAndreas Gohr    {
86f6ef2e50SAndreas Gohr        $this->db->exec('VACUUM');
87f6ef2e50SAndreas Gohr    }
88f6ef2e50SAndreas Gohr
89f6ef2e50SAndreas Gohr    /** @inheritdoc */
9001f06932SAndreas Gohr    public function getPageChunks($page, $firstChunkID)
9101f06932SAndreas Gohr    {
9201f06932SAndreas Gohr        $result = $this->db->queryAll(
9301f06932SAndreas Gohr            'SELECT * FROM embeddings WHERE page = ?',
9401f06932SAndreas Gohr            [$page]
9501f06932SAndreas Gohr        );
9601f06932SAndreas Gohr        $chunks = [];
9701f06932SAndreas Gohr        foreach ($result as $record) {
9801f06932SAndreas Gohr            $chunks[] = new Chunk(
9901f06932SAndreas Gohr                $record['page'],
10001f06932SAndreas Gohr                $record['id'],
10101f06932SAndreas Gohr                $record['chunk'],
10201f06932SAndreas Gohr                json_decode($record['embedding'], true),
10301f06932SAndreas Gohr                $record['created']
10401f06932SAndreas Gohr            );
10501f06932SAndreas Gohr        }
10601f06932SAndreas Gohr        return $chunks;
10701f06932SAndreas Gohr    }
10801f06932SAndreas Gohr
10901f06932SAndreas Gohr
11001f06932SAndreas Gohr    /** @inheritdoc */
111f6ef2e50SAndreas Gohr    public function getSimilarChunks($vector, $limit = 4)
112f6ef2e50SAndreas Gohr    {
113f6ef2e50SAndreas Gohr        $result = $this->db->queryAll(
114f6ef2e50SAndreas Gohr            'SELECT *, COSIM(?, embedding) AS similarity
115f6ef2e50SAndreas Gohr               FROM embeddings
116f6ef2e50SAndreas Gohr              WHERE GETACCESSLEVEL(page) > 0
11781b450c8SAndreas Gohr                AND similarity > CAST(? AS FLOAT)
118f6ef2e50SAndreas Gohr           ORDER BY similarity DESC
119f6ef2e50SAndreas Gohr              LIMIT ?',
12081b450c8SAndreas Gohr            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
121f6ef2e50SAndreas Gohr        );
122f6ef2e50SAndreas Gohr        $chunks = [];
123f6ef2e50SAndreas Gohr        foreach ($result as $record) {
124f6ef2e50SAndreas Gohr            $chunks[] = new Chunk(
125f6ef2e50SAndreas Gohr                $record['page'],
126f6ef2e50SAndreas Gohr                $record['id'],
127f6ef2e50SAndreas Gohr                $record['chunk'],
128f6ef2e50SAndreas Gohr                json_decode($record['embedding'], true),
1299b3d1b36SAndreas Gohr                $record['created'],
1309b3d1b36SAndreas Gohr                $record['similarity']
131f6ef2e50SAndreas Gohr            );
132f6ef2e50SAndreas Gohr        }
133f6ef2e50SAndreas Gohr        return $chunks;
134f6ef2e50SAndreas Gohr    }
135f6ef2e50SAndreas Gohr
136f6ef2e50SAndreas Gohr    /** @inheritdoc */
137f6ef2e50SAndreas Gohr    public function statistics()
138f6ef2e50SAndreas Gohr    {
139f6ef2e50SAndreas Gohr        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
140f6ef2e50SAndreas Gohr        $size = $this->db->queryValue(
141f6ef2e50SAndreas Gohr            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
142f6ef2e50SAndreas Gohr        );
143f6ef2e50SAndreas Gohr        return [
144f6ef2e50SAndreas Gohr            'storage type' => 'SQLite',
145f6ef2e50SAndreas Gohr            'chunks' => $items,
146f6ef2e50SAndreas Gohr            'db size' => filesize_h($size)
147f6ef2e50SAndreas Gohr        ];
148f6ef2e50SAndreas Gohr    }
149f6ef2e50SAndreas Gohr
150f6ef2e50SAndreas Gohr    /**
151f6ef2e50SAndreas Gohr     * Method registered as SQLite callback to calculate the cosine similarity
152f6ef2e50SAndreas Gohr     *
153f6ef2e50SAndreas Gohr     * @param string $query JSON encoded vector array
154f6ef2e50SAndreas Gohr     * @param string $embedding JSON encoded vector array
155f6ef2e50SAndreas Gohr     * @return float
156f6ef2e50SAndreas Gohr     */
157f6ef2e50SAndreas Gohr    public function sqliteCosineSimilarityCallback($query, $embedding)
158f6ef2e50SAndreas Gohr    {
159f6ef2e50SAndreas Gohr        return (float)$this->cosineSimilarity(json_decode($query), json_decode($embedding));
160f6ef2e50SAndreas Gohr    }
161f6ef2e50SAndreas Gohr
162f6ef2e50SAndreas Gohr    /**
163f6ef2e50SAndreas Gohr     * Calculate the cosine similarity between two vectors
164f6ef2e50SAndreas Gohr     *
165*35555bacSAndreas Gohr     * Actually just calculating the dot product of the two vectors, since they are normalized
166*35555bacSAndreas Gohr     *
167*35555bacSAndreas Gohr     * @param float[] $queryVector The normalized vector of the search phrase
168*35555bacSAndreas Gohr     * @param float[] $embedding The normalized vector of the chunk
169f6ef2e50SAndreas Gohr     * @return float
170f6ef2e50SAndreas Gohr     */
171f6ef2e50SAndreas Gohr    protected function cosineSimilarity($queryVector, $embedding)
172f6ef2e50SAndreas Gohr    {
173f6ef2e50SAndreas Gohr        $dotProduct = 0;
174f6ef2e50SAndreas Gohr        foreach ($queryVector as $key => $value) {
175f6ef2e50SAndreas Gohr            $dotProduct += $value * $embedding[$key];
176f6ef2e50SAndreas Gohr        }
177*35555bacSAndreas Gohr        return $dotProduct;
178f6ef2e50SAndreas Gohr    }
179f6ef2e50SAndreas Gohr}
180