xref: /plugin/aichat/Storage/SQLiteStorage.php (revision f6ef2e505783ac17f756e44bf15c66238362377a)
1<?php
2
3
4namespace dokuwiki\plugin\aichat\Storage;
5
6use dokuwiki\plugin\aichat\Chunk;
7use dokuwiki\plugin\sqlite\SQLiteDB;
8
/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table. The embedding vector of each chunk is stored as a
 * JSON encoded string and compared against a query vector through the custom COSIM() SQL
 * function that is registered in the constructor.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var SQLiteDB */
    protected $db;

    /** @var string|null JSON string of the query vector most recently decoded by the COSIM callback */
    protected $lastQueryJson;

    /** @var mixed decoded form of $lastQueryJson (an array for valid vector JSON) */
    protected $lastQueryVector;

    /**
     * Initializes the database connection and registers our custom function
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        // make COSIM(queryJson, embeddingJson) available in SQL, see getSimilarChunks()
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * @inheritdoc
     *
     * Returns null when no row with the given id exists.
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->recordToChunk($record);
    }

    /**
     * @inheritdoc
     *
     * When $clear is set, all rows of the embeddings table are removed; otherwise nothing is done.
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /**
     * @inheritdoc
     *
     * Intentionally does nothing in this backend.
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * @inheritdoc
     *
     * Rows are selected by page name only; $firstChunkID is not used by this backend.
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * @inheritdoc
     *
     * Each chunk is written as one row; the embedding vector is stored JSON encoded.
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * @inheritdoc
     *
     * Runs VACUUM on the database.
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * @inheritdoc
     *
     * Rows are ranked by their COSIM() similarity to $vector (highest first) and cut off at $limit.
     * Only rows passing the GETACCESSLEVEL(page) > 0 filter are considered.
     * NOTE(review): GETACCESSLEVEL is not registered in this class - presumably it is provided by
     * the sqlite plugin's SQLiteDB; confirm.
     * The computed similarity is used for ordering only and is not passed on to the Chunk objects.
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), (int)$limit]
        );

        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record);
        }
        return $chunks;
    }

    /**
     * @inheritdoc
     *
     * Reports the number of rows in the embeddings table and the database size, calculated as
     * page_count * page_size and formatted via filesize_h().
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * SQLite calls this once per row. Rows whose embedding is NULL or cannot be decoded score 0
     * instead of raising an error that would abort the whole query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        if (!is_string($query) || !is_string($embedding)) return 0.0;

        // the query vector is the same for every row of a search, so decode it only once
        if ($query !== $this->lastQueryJson) {
            $this->lastQueryJson = $query;
            $this->lastQueryVector = json_decode($query, true);
        }

        return (float)$this->cosineSimilarity($this->lastQueryVector, json_decode($embedding, true));
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the components present in the query vector are taken into account. A component that is
     * missing from the embedding counts as 0. If either argument is not an array, or if either
     * vector has zero length, 0.0 is returned because the similarity is undefined in that case.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        if (!is_array($queryVector) || !is_array($embedding)) return 0.0;

        $dotProduct = 0.0;
        $queryLengthSquared = 0.0;
        $embeddingLengthSquared = 0.0;

        foreach ($queryVector as $key => $value) {
            $other = $embedding[$key] ?? 0.0;
            $dotProduct += $value * $other;
            $queryLengthSquared += $value * $value;
            $embeddingLengthSquared += $other * $other;
        }

        $denominator = sqrt($queryLengthSquared) * sqrt($embeddingLengthSquared);
        // avoid a division by zero for empty or all-zero vectors
        if ($denominator === 0.0) return 0.0;

        return $dotProduct / $denominator;
    }

    /**
     * Build a Chunk object from a row of the embeddings table
     *
     * @param array $record row with the keys page, id, chunk, embedding (JSON encoded) and created
     * @return Chunk
     */
    protected function recordToChunk($record)
    {
        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }
}
157