<?php


namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table; the embedding vector of each chunk is
 * stored as a JSON encoded array in the `embedding` column.
 *
 * Note: all embeddings are stored and returned as normalized vectors
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * The COSIM(query, embedding) SQL function is backed by
     * sqliteCosineSimilarityCallback() and is used by getSimilarChunks().
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * @inheritdoc
     *
     * Returns null when no row with the given ID exists.
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->hydrateChunkFromRecord($record);
    }

    /**
     * @inheritdoc
     *
     * When $clear is truthy, all rows of the embeddings table are removed first.
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /**
     * @inheritdoc
     *
     * Intentionally does nothing in this backend.
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * @inheritdoc
     *
     * Rows are selected by page only; $firstChunkID is not used by this backend.
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * @inheritdoc
     *
     * Each chunk is written as one record; the embedding is serialized to JSON.
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * @inheritdoc
     *
     * Runs VACUUM on the database.
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * @inheritdoc
     *
     * Rows are selected by page only; $firstChunkID is not used by this backend.
     */
    public function getPageChunks($page, $firstChunkID)
    {
        $result = $this->db->queryAll(
            'SELECT * FROM embeddings WHERE page = ?',
            [$page]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->hydrateChunkFromRecord($record);
        }
        return $chunks;
    }


    /**
     * @inheritdoc
     *
     * The similarity is computed inside SQLite through the COSIM function registered
     * in the constructor. Only rows scoring above SIMILARITY_THRESHOLD are returned,
     * best match first.
     *
     * NOTE(review): GETACCESSLEVEL is not registered in this class; presumably it is
     * provided by the SQLiteDB helper - verify.
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->hydrateChunkFromRecord($record, true);
        }
        return $chunks;
    }

    /**
     * @inheritdoc
     *
     * Reports the number of stored chunks and the database size (page count times
     * page size, formatted through filesize_h()).
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Input that does not decode to an array (empty column, broken JSON, scalar) is
     * scored as 0.0 instead of being iterated, so a single bad row can not raise
     * warnings or errors in the middle of a query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // decode as arrays, the same way the embedding column is decoded everywhere else
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);

        if (!is_array($queryVector) || !is_array($embeddingVector)) {
            return 0.0;
        }

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Actually just calculating the dot product of the two vectors, since they are normalized
     *
     * Components of the query vector that have no counterpart in the embedding are
     * skipped; they contribute nothing to the sum.
     *
     * @param float[] $queryVector The normalized vector of the search phrase
     * @param float[] $embedding The normalized vector of the chunk
     * @return float
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0.0;
        foreach ($queryVector as $key => $value) {
            // guards against vectors of different dimensions
            if (!isset($embedding[$key])) continue;
            $dotProduct += $value * $embedding[$key];
        }
        return $dotProduct;
    }

    /**
     * Build a Chunk object from a row of the embeddings table
     *
     * @param array $record The database row; needs the page, id, chunk, embedding and created columns
     * @param bool $withSimilarity Also pass the row's similarity column on to the Chunk
     * @return Chunk
     */
    protected function hydrateChunkFromRecord($record, $withSimilarity = false)
    {
        $embedding = json_decode($record['embedding'], true);

        if ($withSimilarity) {
            return new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                $embedding,
                $record['created'],
                $record['similarity']
            );
        }

        // leave the similarity argument out so the Chunk constructor applies its own default
        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            $embedding,
            $record['created']
        );
    }
}