xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 01f06932bbd74c60ea6c93ab68b0d6cf32d05aea)
1<?php
2
3
4namespace dokuwiki\plugin\aichat\Storage;
5
6use dokuwiki\plugin\aichat\Chunk;
7use dokuwiki\plugin\sqlite\SQLiteDB;
8
/**
 * Implements the storage backend using a SQLite database
 *
 * Every chunk is one row of the `embeddings` table; the embedding vector is stored as a JSON
 * encoded array. Similarity search runs inside SQLite through the custom COSIM() SQL function
 * that the constructor registers on the PDO connection.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /** @var string|null the query JSON most recently decoded by the COSIM callback */
    private $cachedQueryJson;

    /** @var array|null decoded form of $cachedQueryJson, null if it did not decode to an array */
    private $cachedQueryVector;

    /**
     * Initializes the database connection and registers our custom function
     *
     * COSIM(query, embedding) is made available to SQL statements and is backed by
     * sqliteCosineSimilarityCallback().
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * @inheritdoc
     *
     * Returns null when no row with the given ID exists.
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->recordToChunk($record);
    }

    /**
     * @inheritdoc
     *
     * When $clear is set, all stored embeddings are removed.
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * @inheritdoc
     *
     * Rows are selected by page name only; $firstChunkID is not used by this backend.
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * @inheritdoc
     *
     * Each chunk is saved as one record; the embedding is serialized to JSON.
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * @inheritdoc
     *
     * Runs VACUUM on the database.
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * @inheritdoc
     *
     * Rows are selected by page name only; $firstChunkID is not used by this backend.
     */
    public function getPageChunks($page, $firstChunkID)
    {
        $result = $this->db->queryAll(
            'SELECT * FROM embeddings WHERE page = ?',
            [$page]
        );

        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record);
        }
        return $chunks;
    }


    /**
     * @inheritdoc
     *
     * The similarity is calculated per row by the COSIM() function registered in the constructor.
     * Only rows scoring above SIMILARITY_THRESHOLD are returned, best match first.
     * GETACCESSLEVEL() is not defined in this file - presumably it is registered by the sqlite
     * plugin and yields the current user's permission on the page (TODO confirm).
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );

        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record, true);
        }
        return $chunks;
    }

    /**
     * @inheritdoc
     *
     * Reports the number of stored chunks and the database size (page count * page size).
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Input that is not a string or does not decode to a JSON array (for example a NULL column)
     * is scored as 0.0 instead of triggering PHP errors inside the running query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        if (!is_string($query) || !is_string($embedding)) return 0.0;

        // the callback runs once per row with the same query, so decode that vector only once
        if ($this->cachedQueryJson !== $query) {
            $decoded = json_decode($query, true);
            $this->cachedQueryVector = is_array($decoded) ? $decoded : null;
            $this->cachedQueryJson = $query;
        }

        $embeddingVector = json_decode($embedding, true);
        if ($this->cachedQueryVector === null || !is_array($embeddingVector)) return 0.0;

        return (float)$this->cosineSimilarity($this->cachedQueryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the keys of the query vector are visited. A component missing from the embedding is
     * treated as 0. When either vector has zero magnitude the similarity is undefined and 0.0 is
     * returned instead of dividing by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        if ($denominator === 0.0) return 0.0;

        return $dotProduct / $denominator;
    }

    /**
     * Create a Chunk object from a row of the embeddings table
     *
     * @param array $record The database row
     * @param bool $withSimilarity Pass the row's `similarity` column on to the Chunk as well
     * @return Chunk
     */
    protected function recordToChunk($record, $withSimilarity = false)
    {
        $embedding = json_decode($record['embedding'], true);

        if ($withSimilarity) {
            return new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                $embedding,
                $record['created'],
                $record['similarity']
            );
        }

        // keep the five argument form so the Chunk constructor applies its own default
        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            $embedding,
            $record['created']
        );
    }
}
183