xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 81b450c8469c053b1bd31ea13f5d6461d1f174e3)
1f6ef2e50SAndreas Gohr<?php
2f6ef2e50SAndreas Gohr
3f6ef2e50SAndreas Gohr
4f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
5f6ef2e50SAndreas Gohr
6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
7f6ef2e50SAndreas Gohruse dokuwiki\plugin\sqlite\SQLiteDB;
8f6ef2e50SAndreas Gohr
/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are stored in the `embeddings` table with their embedding vector serialized as JSON.
 * Similarity search is done inside SQLite through the custom SQL function COSIM, which is
 * registered in the constructor and implemented by sqliteCosineSimilarityCallback().
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        // COSIM(query, embedding) takes exactly two JSON encoded vectors
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * @inheritdoc
     *
     * Returns null when no record with the given ID exists.
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /**
     * @inheritdoc
     *
     * When $clear is set, all rows of the embeddings table are removed.
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /**
     * @inheritdoc
     *
     * Nothing to do for this backend.
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * @inheritdoc
     *
     * Chunks are removed by page name; $firstChunkID is not used by this backend.
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * @inheritdoc
     *
     * Each chunk is saved as one record; the embedding vector is stored JSON encoded.
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * @inheritdoc
     *
     * Runs VACUUM on the database.
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * @inheritdoc
     *
     * Only chunks whose similarity to $vector exceeds SIMILARITY_THRESHOLD are returned,
     * ordered by descending similarity and limited to $limit results.
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // GETACCESSLEVEL is not registered in this class; presumably it is provided by
        // the sqlite plugin and filters out pages the current user may not read.
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /**
     * @inheritdoc
     *
     * Reports the number of stored chunks and the database size (page count * page size).
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Returns 0.0 when either argument does not decode to an array (e.g. a NULL or corrupt
     * embedding column), so a single broken row can not abort the whole query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // cast to string so a NULL column value is handled as invalid JSON
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);

        if (!is_array($queryVector) || !is_array($embeddingVector)) return 0.0;

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the keys present in $queryVector are considered. Components missing from
     * $embedding are treated as 0. When either vector has zero magnitude the similarity
     * is undefined and 0.0 is returned instead of dividing by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // a shorter embedding contributes nothing for the missing dimensions
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        // empty or all-zero vectors: avoid DivisionByZeroError (PHP 8) / warning (PHP 7)
        if ($denominator == 0) return 0.0;

        return $dotProduct / $denominator;
    }
}
162