<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table. Each embedding vector is stored as a JSON encoded
 * array and compared inside SQL through the custom COSIM() function registered in the constructor.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        // expose the PHP implementation to SQL as COSIM(queryJson, embeddingJson)
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return $this->recordToChunk($record);
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // wipe all stored chunks
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // rows are selected by page name; $firstChunkID is not used by this backend
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                // vectors are persisted as JSON text
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // rebuild the database file once the (re)indexing run is complete
        $this->db->exec('VACUUM');
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        // rows are selected by page name; $firstChunkID is not used by this backend
        $result = $this->db->queryAll(
            'SELECT * FROM embeddings WHERE page = ?',
            [$page]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // COSIM is the callback registered in the constructor.
        // NOTE(review): GETACCESSLEVEL is not registered in this class; presumably it is provided
        // by the sqlite plugin - confirm.
        // The threshold placeholder is wrapped in CAST(... AS FLOAT) so the comparison is done on a
        // float value.
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = $this->recordToChunk($record);
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        // number of database pages multiplied by the page size
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * Input that does not decode to an array (e.g. invalid JSON or NULL) yields 0.0 instead of
     * raising an error in the middle of a running query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);
        if (!is_array($queryVector) || !is_array($embeddingVector)) {
            return 0.0;
        }

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Components missing from $embedding are treated as 0. When either vector has zero magnitude
     * the similarity is undefined; 0.0 is returned in that case instead of dividing by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);
        if ($denominator <= 0) {
            // empty or all-zero vector: avoid a division by zero
            return 0.0;
        }

        return $dotProduct / $denominator;
    }

    /**
     * Build a Chunk object from a row of the embeddings table
     *
     * The similarity is only handed to the Chunk when the row actually carries that column,
     * which is the case for rows returned by getSimilarChunks().
     *
     * @param array $record database row with page, id, chunk, embedding, created [, similarity]
     * @return Chunk
     */
    protected function recordToChunk($record)
    {
        $embedding = json_decode($record['embedding'], true);

        if (array_key_exists('similarity', $record)) {
            return new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                $embedding,
                $record['created'],
                $record['similarity']
            );
        }

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            $embedding,
            $record['created']
        );
    }
}