xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 35555bacbc10d1d920fb24cfc042d01a87f94b73)
1<?php
2
3
4namespace dokuwiki\plugin\aichat\Storage;
5
6use dokuwiki\plugin\aichat\Chunk;
7use dokuwiki\plugin\sqlite\SQLiteDB;
8
/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks live in the `embeddings` table; every embedding is persisted as a JSON encoded array.
 * Similarity lookups run inside SQLite through the custom COSIM() function that the
 * constructor registers on the PDO connection.
 *
 * Note: all embeddings are stored and returned as normalized vectors
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /** @var string|null JSON of the query vector most recently decoded by the COSIM callback */
    private $lastQueryJson;

    /** @var float[] decoded form of $lastQueryJson (empty when it could not be decoded) */
    private $lastQueryVector = [];

    /**
     * Initializes the database connection and registers our custom function
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        // COSIM(queryJson, embeddingJson) is evaluated in PHP for each row SQLite hands to it
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->recordToChunk($record);
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // $firstChunkID is not needed here, rows are addressed by their page name
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        // $firstChunkID is not needed here, rows are addressed by their page name
        $result = $this->db->queryAll(
            'SELECT * FROM embeddings WHERE page = ?',
            [$page]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record);
        }
        return $chunks;
    }


    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // GETACCESSLEVEL() is not registered by this class
        // NOTE(review): presumably provided by the sqlite plugin's SQLiteDB - confirm
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record, true);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        // database size in bytes = number of pages * bytes per page
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * SQLite invokes this for each row it evaluates, always with the same query vector.
     * The decoded query is therefore memoized so that only the row's embedding has to be
     * decoded on subsequent calls. Vectors that cannot be decoded into an array yield a
     * similarity of 0 instead of raising PHP warnings inside the database callback.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        if (!is_string($query) || !is_string($embedding)) return 0.0;

        if ($query !== $this->lastQueryJson) {
            $decoded = json_decode($query, true);
            $this->lastQueryVector = is_array($decoded) ? $decoded : [];
            $this->lastQueryJson = $query;
        }

        $stored = json_decode($embedding, true);
        if (!is_array($stored)) return 0.0;

        return (float)$this->cosineSimilarity($this->lastQueryVector, $stored);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Actually just calculating the dot product of the two vectors, since they are normalized
     *
     * Components of the query vector that have no counterpart in the embedding contribute
     * nothing to the sum (same numeric result as multiplying by a missing value, without the warning).
     *
     * @param float[] $queryVector The normalized vector of the search phrase
     * @param float[] $embedding The normalized vector of the chunk
     * @return float
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0.0;
        foreach ($queryVector as $key => $value) {
            if (!isset($embedding[$key])) continue;
            $dotProduct += $value * $embedding[$key];
        }
        return $dotProduct;
    }

    /**
     * Build a Chunk object from a row of the embeddings table
     *
     * @param array $record database row with page, id, chunk, embedding and created columns
     * @param bool $withSimilarity pass the row's computed similarity column on to the Chunk
     * @return Chunk
     */
    private function recordToChunk($record, $withSimilarity = false)
    {
        $embedding = json_decode($record['embedding'], true);

        if ($withSimilarity) {
            return new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                $embedding,
                $record['created'],
                $record['similarity']
            );
        }

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            $embedding,
            $record['created']
        );
    }
}
180