xref: /plugin/aichat/Storage/SQLiteStorage.php (revision 9b3d1b36b01775ec335ec281af23d89587095d71)
1f6ef2e50SAndreas Gohr<?php
2f6ef2e50SAndreas Gohr
3f6ef2e50SAndreas Gohr
4f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
5f6ef2e50SAndreas Gohr
6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
7f6ef2e50SAndreas Gohruse dokuwiki\plugin\sqlite\SQLiteDB;
8f6ef2e50SAndreas Gohr
/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table (columns used here: page, id, chunk,
 * embedding, created). The embedding vector is stored as a JSON encoded array and
 * compared against a query vector through the custom SQL function COSIM, which is
 * backed by {@see SQLiteStorage::sqliteCosineSimilarityCallback()}.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * COSIM is registered as a two-argument SQL function on the underlying PDO handle.
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        // no row with that ID
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // wipe all stored chunks; the missing WHERE clause is intentional
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op: existing rows simply stay in the table
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // rows are addressed by page name, $firstChunkID is not needed here
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                // the vector is persisted as JSON text and decoded again on read
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // COSIM is our own callback registered in the constructor.
        // GETACCESSLEVEL is not defined in this class - presumably it is provided by the
        // sqlite plugin; rows for which it does not return a value > 0 are excluded.
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        // database size in bytes = number of pages * bytes per page
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * This runs once per row inside the query, so it must never throw: a row whose
     * embedding is NULL or not a valid JSON array is rated 0.0 instead of aborting
     * the whole similarity search.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float the similarity, 0.0 if either argument can not be decoded to an array
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // the string cast keeps a SQL NULL from being passed to json_decode() as null
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);

        if (!is_array($queryVector) || !is_array($embeddingVector)) return 0.0;

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the keys present in $queryVector are evaluated. A component missing from
     * $embedding is treated as 0. When either vector has no magnitude (all zeros or
     * empty) the similarity is undefined and 0.0 is returned instead of dividing by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // tolerate vectors of different dimensions without "undefined key" warnings
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);

        // written as a negated ">" so that NAN is caught as well as 0
        if (!($denominator > 0)) return 0.0;

        return $dotProduct / $denominator;
    }
}
158