<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * All chunks are kept in the `embeddings` table. Embedding vectors are stored JSON encoded
 * and are compared through the custom COSIM() SQL function registered in the constructor.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * COSIM(query, embedding) is made available to SQL statements and is backed by
     * sqliteCosineSimilarityCallback().
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->chunkFromRecord($record);
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        // only wipe the table when a full rebuild was requested
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // rows are selected by page name only, $firstChunkID is not used by this backend
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                // the vector is persisted as JSON text, see chunkFromRecord() for the reverse
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated(),
            ]);
        }
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        // rows are selected by page name only, $firstChunkID is not used by this backend
        $result = $this->db->queryAll(
            'SELECT * FROM embeddings WHERE page = ?',
            [$page]
        );

        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->chunkFromRecord($record);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // GETACCESSLEVEL() is not registered in this class
        // NOTE(review): presumably provided by the sqlite plugin to apply page ACLs - confirm
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );

        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->chunkFromRecord($record, true);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        // database size in bytes: number of pages multiplied by the size of one page
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size),
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Returns 0.0 when either argument does not decode to an array (e.g. a NULL or corrupt
     * embedding column), so a single bad row cannot abort the whole similarity query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        $queryVector = json_decode($query);
        $chunkVector = json_decode($embedding);
        if (!is_array($queryVector) || !is_array($chunkVector)) return 0.0;

        return (float)$this->cosineSimilarity($queryVector, $chunkVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the components present in $queryVector are taken into account. A component that is
     * missing from $embedding is treated as 0. When either vector has a magnitude of zero the
     * similarity is undefined and 0.0 is returned instead of dividing by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // tolerate embeddings that are shorter than the query vector
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        // empty or all-zero vectors have no direction, report them as "not similar"
        if ($denominator == 0.0) return 0.0;

        return $dotProduct / $denominator;
    }

    /**
     * Create a Chunk object from a row of the embeddings table
     *
     * @param array $record Row with the keys page, id, chunk, embedding and created
     * @param bool $withSimilarity Pass the row's similarity value to the Chunk as well
     * @return Chunk
     */
    protected function chunkFromRecord($record, $withSimilarity = false)
    {
        // the vector is stored as JSON text
        $embedding = json_decode($record['embedding'], true);

        if ($withSimilarity) {
            return new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                $embedding,
                $record['created'],
                $record['similarity']
            );
        }

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            $embedding,
            $record['created']
        );
    }
}