xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 9b3d1b36b01775ec335ec281af23d89587095d71)
1<?php
2
3
4namespace dokuwiki\plugin\aichat\Storage;
5
6use dokuwiki\plugin\aichat\Chunk;
7use dokuwiki\plugin\sqlite\SQLiteDB;
8
/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table. Embedding vectors are stored JSON encoded and are
 * compared inside SQL through the custom COSIM() function registered in the constructor.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * COSIM(query, embedding) becomes available to all statements run on this connection and is
     * backed by sqliteCosineSimilarityCallback().
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * @inheritdoc
     *
     * Returns null when no row with the given ID exists.
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /**
     * @inheritdoc
     *
     * When $clear is set, all stored embeddings are removed before new ones are added.
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op: existing rows simply stay in the table
    }

    /**
     * @inheritdoc
     *
     * Rows are matched by page only; $firstChunkID is not needed by this backend.
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * @inheritdoc
     *
     * Each chunk is written as one row; the embedding vector is stored as a JSON string.
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * @inheritdoc
     *
     * Runs VACUUM on the database once all chunks have been written.
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * @inheritdoc
     *
     * The similarity is calculated in SQL via COSIM() and the best $limit rows are returned,
     * most similar first.
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // NOTE(review): GETACCESSLEVEL() is not registered in this class - presumably it is
        // provided by the sqlite plugin to filter out pages the current user may not read
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /**
     * @inheritdoc
     *
     * Reports the number of stored chunks and the database size (page count * page size).
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Input that does not decode to an array (NULL column, broken JSON) is rated with a
     * similarity of 0 instead of raising an error, so a single bad row can not abort the
     * whole query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        $queryVector = json_decode((string)$query, true);
        $chunkVector = json_decode((string)$embedding, true);
        if (!is_array($queryVector) || !is_array($chunkVector)) return 0.0;

        return (float)$this->cosineSimilarity($queryVector, $chunkVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the dimensions present in the query vector are evaluated. Dimensions missing from
     * the chunk vector count as 0. When either vector has no magnitude the similarity is
     * undefined and 0 is returned.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // a shorter chunk vector must not trigger undefined offset notices
            $other = $embedding[$key] ?? 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        // empty or all-zero vectors would otherwise divide by zero (an Error on PHP 8)
        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        if ($denominator === 0.0) return 0.0;

        return $dotProduct / $denominator;
    }
}
158