xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 81b450c8469c053b1bd31ea13f5d6461d1f174e3)
1<?php
2
3
4namespace dokuwiki\plugin\aichat\Storage;
5
6use dokuwiki\plugin\aichat\Chunk;
7use dokuwiki\plugin\sqlite\SQLiteDB;
8
/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks live in the `embeddings` table. Each embedding vector is stored as a JSON encoded
 * array. Similarity search runs inside SQLite through the custom COSIM() SQL function that
 * is registered in the constructor and implemented by this class.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /** @var string|null JSON of the query vector most recently decoded by the COSIM callback */
    protected $cachedQueryJson;

    /** @var array|null decoded form of $cachedQueryJson (null when it could not be decoded) */
    protected $cachedQueryVector;

    /**
     * Initializes the database connection and registers our custom function
     *
     * The COSIM(query, embedding) SQL function is backed by sqliteCosineSimilarityCallback()
     * and takes exactly two arguments.
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // wipe all stored chunks so the index is rebuilt from scratch
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op: existing rows simply stay in the table
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // rows are addressed by page name; $firstChunkID is not needed here
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                // vectors are persisted as JSON so the COSIM callback can decode them again
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // compact the database file after the bulk changes
        $this->db->exec('VACUUM');
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // GETACCESSLEVEL() is not defined in this file; presumably it is an SQL function supplied
        // by the sqlite plugin that yields the current user's permission on the page - TODO confirm
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        // page count multiplied by page size gives the database size in bytes
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * SQLite invokes this once per examined row, always with the same query vector. The decoded
     * query vector is therefore cached and only the row's embedding is decoded on each call.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float similarity, 0.0 when either vector cannot be decoded or compared
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        if ($this->cachedQueryJson === null || $query !== $this->cachedQueryJson) {
            $this->cachedQueryVector = is_string($query) ? json_decode($query, true) : null;
            $this->cachedQueryJson = $query;
        }

        $embeddingVector = is_string($embedding) ? json_decode($embedding, true) : null;

        return (float)$this->cosineSimilarity($this->cachedQueryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Vectors that cannot be compared (not an array, different number of components, or a
     * magnitude of zero) are reported as unrelated instead of triggering a division by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        if (!is_array($queryVector) || !is_array($embedding)) return 0.0;
        if (count($queryVector) !== count($embedding)) return 0.0;

        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            $dotProduct += $value * $embedding[$key];
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $embedding[$key] * $embedding[$key];
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        // an empty or all-zero vector has no direction; dividing would throw on PHP 8
        if ($denominator === 0.0) return 0.0;

        return $dotProduct / $denominator;
    }
}
162