<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are stored as rows of the `embeddings` table. The embedding vector of each
 * chunk is persisted as a JSON encoded array and compared against a query vector with
 * the custom SQL function COSIM, which this class registers on the PDO connection.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var float minimum similarity to consider a chunk a match */
    const SIMILARITY_THRESHOLD = 0.75;

    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * COSIM(query, embedding) is made available to SQL statements and is backed by
     * sqliteCosineSimilarityCallback().
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * Load a single chunk by its ID
     *
     * @inheritdoc
     * @return Chunk|null null when no row with the given ID exists
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /**
     * Prepare the storage for (re)indexing
     *
     * @inheritdoc
     * @param bool $clear when true, all rows are removed from the embeddings table
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /**
     * Nothing is done here for this backend
     *
     * @inheritdoc
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Remove all chunks stored for the given page
     *
     * The rows are selected by page name only; $firstChunkID is not used by this backend.
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * Persist the given chunks, one row per chunk
     *
     * The embedding vector is stored JSON encoded.
     *
     * @inheritdoc
     * @param Chunk[] $chunks
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * Compact the database file once indexing is done
     *
     * @inheritdoc
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * Find the chunks most similar to the given vector
     *
     * Only rows for which GETACCESSLEVEL(page) is greater than 0 and whose similarity
     * exceeds SIMILARITY_THRESHOLD are returned, best match first.
     * NOTE(review): GETACCESSLEVEL is not registered in this class - presumably it is
     * provided by the sqlite plugin; confirm.
     *
     * @inheritdoc
     * @param float[] $vector the query vector
     * @param int $limit maximum number of chunks to return
     * @return Chunk[]
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        // the threshold is cast explicitly so it is compared numerically regardless of how it is bound
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
                AND similarity > CAST(? AS FLOAT)
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /**
     * Report basic figures about this storage
     *
     * The database size is calculated as page_count * page_size using the SQLite
     * pragma table functions and formatted with filesize_h().
     *
     * @inheritdoc
     * @return array with the keys 'storage type', 'chunks' and 'db size'
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * A value that does not decode to an array (NULL column, malformed JSON, scalar)
     * yields a similarity of 0.0 instead of raising errors inside the running query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // decode as arrays so that JSON objects do not end up as stdClass instances
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);

        if (!is_array($queryVector) || !is_array($embeddingVector)) return 0.0;

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Dimensions missing from $embedding are treated as 0. When either vector has no
     * magnitude the similarity is undefined; 0.0 is returned in that case rather than
     * dividing by zero.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // a missing dimension contributes nothing instead of triggering an undefined offset warning
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);

        // also catches NAN, which never compares greater than zero
        if (!($denominator > 0)) return 0.0;

        return $dotProduct / $denominator;
    }
}