<?php

// xref: /plugin/aichat/Storage/SQLiteStorage.php (revision f6ef2e505783ac17f756e44bf15c66238362377a)

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table (columns used here: page, id, chunk, embedding, created).
 * The embedding vector is stored as a JSON encoded array and compared to the query vector through
 * the custom COSIM() SQL function that this class registers on the PDO connection.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var SQLiteDB */
    protected $db;

    /** @var string|null JSON of the query vector that was decoded last by the COSIM callback */
    protected $cachedQueryJson;

    /** @var array|null decoded form of $cachedQueryJson */
    protected $cachedQueryVector;

    /**
     * Initializes the database connection and registers our custom function
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        // COSIM(queryJson, embeddingJson) is evaluated in PHP, once per row
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->recordToChunk($record);
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // rows are addressed by page name; $firstChunkID is not needed by this backend
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // rebuild the database file after the bulk changes
        $this->db->exec('VACUUM');
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // NOTE(review): GETACCESSLEVEL() is not registered in this class; presumably SQLiteDB provides it
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), (int)$limit]
        );

        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Invalid or missing JSON on either side results in a similarity of 0 instead of an error.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // the query vector is identical for every row of a search, so decode it only when it changes
        if ($query !== $this->cachedQueryJson) {
            $this->cachedQueryJson = $query;
            $this->cachedQueryVector = json_decode((string)$query, true);
        }

        return (float)$this->cosineSimilarity(
            $this->cachedQueryVector,
            json_decode((string)$embedding, true)
        );
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the dimensions present in the query vector are considered. Dimensions missing from the
     * embedding count as 0. When one of the arguments is not an array or one of the vectors has
     * no magnitude, the similarity is undefined and 0.0 is returned.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        // json_decode() returns null on malformed input
        if (!is_array($queryVector) || !is_array($embedding)) return 0.0;

        $dotProduct = 0.0;
        $queryEmbeddingLength = 0.0;
        $embeddingLength = 0.0;

        foreach ($queryVector as $key => $value) {
            $other = $embedding[$key] ?? 0;
            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        // written as a negated comparison so that a NAN denominator is rejected as well as zero
        if (!($denominator > 0)) return 0.0;

        return $dotProduct / $denominator;
    }

    /**
     * Create a Chunk object from a row of the embeddings table
     *
     * @param array $record row with the keys page, id, chunk, embedding (JSON) and created
     * @return Chunk
     */
    protected function recordToChunk($record)
    {
        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }
}
157