<?php


namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table; the embedding vector is stored as a JSON encoded
 * array. Similarity search is done inside SQLite through the custom COSIM() SQL function that
 * this class registers on the PDO connection.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        // make COSIM(queryJson, embeddingJson) available in SQL statements
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * @inheritdoc
     *
     * Returns null when no row with the given ID exists.
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /**
     * @inheritdoc
     *
     * When $clear is set, all rows of the embeddings table are removed.
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /**
     * @inheritdoc
     *
     * This backend does nothing here.
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * @inheritdoc
     *
     * Rows are matched by page name only; $firstChunkID is not used by this backend.
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * @inheritdoc
     *
     * Each chunk is saved as one row; the embedding vector is stored JSON encoded.
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * @inheritdoc
     *
     * Runs VACUUM on the database once the index has been (re)built.
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * @inheritdoc
     *
     * The similarity is calculated per row by the COSIM() function registered in the constructor.
     * Only rows above SIMILARITY_THRESHOLD are returned, best match first.
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // NOTE(review): GETACCESSLEVEL() is not defined in this file - presumably an SQL function
        // provided by the sqlite plugin that returns the current user's ACL level for the page.
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /**
     * @inheritdoc
     *
     * Reports the number of stored chunks and the database size (page count times page size).
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Input that does not decode to an array (NULL column, malformed JSON) yields a similarity
     * of 0.0 instead of raising an error in the middle of a running query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // decode to arrays (never stdClass) so the vectors can always be indexed
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);

        if (!is_array($queryVector) || !is_array($embeddingVector)) {
            return 0.0;
        }

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Components missing from $embedding are treated as 0. If either vector has zero magnitude
     * (including empty vectors) the similarity is undefined and 0.0 is returned.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // tolerate vectors of different dimensions instead of reading an undefined offset
            $other = $embedding[$key] ?? 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        if ($denominator == 0) {
            // avoid DivisionByZeroError for zero-magnitude vectors
            return 0.0;
        }

        return $dotProduct / $denominator;
    }
}