1f6ef2e50SAndreas Gohr<?php 2f6ef2e50SAndreas Gohr 3f6ef2e50SAndreas Gohr 4f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage; 5f6ef2e50SAndreas Gohr 6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk; 7f6ef2e50SAndreas Gohruse dokuwiki\plugin\sqlite\SQLiteDB; 8f6ef2e50SAndreas Gohr 9f6ef2e50SAndreas Gohr/** 10f6ef2e50SAndreas Gohr * Implements the storage backend using a SQLite database 11*35555bacSAndreas Gohr * 12*35555bacSAndreas Gohr * Note: all embeddings are stored and returned as normalized vectors 13f6ef2e50SAndreas Gohr */ 14f6ef2e50SAndreas Gohrclass SQLiteStorage extends AbstractStorage 15f6ef2e50SAndreas Gohr{ 1681b450c8SAndreas Gohr /** @var float minimum similarity to consider a chunk a match */ 1781b450c8SAndreas Gohr const SIMILARITY_THRESHOLD = 0.75; 1881b450c8SAndreas Gohr 19f6ef2e50SAndreas Gohr /** @var SQLiteDB */ 20f6ef2e50SAndreas Gohr protected $db; 21f6ef2e50SAndreas Gohr 22f6ef2e50SAndreas Gohr /** 23f6ef2e50SAndreas Gohr * Initializes the database connection and registers our custom function 24f6ef2e50SAndreas Gohr * 25f6ef2e50SAndreas Gohr * @throws \Exception 26f6ef2e50SAndreas Gohr */ 27f6ef2e50SAndreas Gohr public function __construct() 28f6ef2e50SAndreas Gohr { 29f6ef2e50SAndreas Gohr $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/'); 30f6ef2e50SAndreas Gohr $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2); 31f6ef2e50SAndreas Gohr } 32f6ef2e50SAndreas Gohr 33f6ef2e50SAndreas Gohr /** @inheritdoc */ 34f6ef2e50SAndreas Gohr public function getChunk($chunkID) 35f6ef2e50SAndreas Gohr { 36f6ef2e50SAndreas Gohr $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]); 37f6ef2e50SAndreas Gohr if (!$record) return null; 38f6ef2e50SAndreas Gohr 39f6ef2e50SAndreas Gohr return new Chunk( 40f6ef2e50SAndreas Gohr $record['page'], 41f6ef2e50SAndreas Gohr $record['id'], 42f6ef2e50SAndreas Gohr $record['chunk'], 43f6ef2e50SAndreas Gohr json_decode($record['embedding'], true), 44f6ef2e50SAndreas Gohr $record['created'] 45f6ef2e50SAndreas Gohr ); 46f6ef2e50SAndreas Gohr } 47f6ef2e50SAndreas Gohr 48f6ef2e50SAndreas Gohr /** @inheritdoc */ 49f6ef2e50SAndreas Gohr public function startCreation($clear = false) 50f6ef2e50SAndreas Gohr { 51f6ef2e50SAndreas Gohr if ($clear) { 52f6ef2e50SAndreas Gohr /** @noinspection SqlWithoutWhere */ 53f6ef2e50SAndreas Gohr $this->db->exec('DELETE FROM embeddings'); 54f6ef2e50SAndreas Gohr } 55f6ef2e50SAndreas Gohr } 56f6ef2e50SAndreas Gohr 57f6ef2e50SAndreas Gohr /** @inheritdoc */ 58f6ef2e50SAndreas Gohr public function reusePageChunks($page, $firstChunkID) 59f6ef2e50SAndreas Gohr { 60f6ef2e50SAndreas Gohr // no-op 61f6ef2e50SAndreas Gohr } 62f6ef2e50SAndreas Gohr 63f6ef2e50SAndreas Gohr /** @inheritdoc */ 64f6ef2e50SAndreas Gohr public function deletePageChunks($page, $firstChunkID) 65f6ef2e50SAndreas Gohr { 66f6ef2e50SAndreas Gohr $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]); 67f6ef2e50SAndreas Gohr } 68f6ef2e50SAndreas Gohr 69f6ef2e50SAndreas Gohr /** @inheritdoc */ 70f6ef2e50SAndreas Gohr public function addPageChunks($chunks) 71f6ef2e50SAndreas Gohr { 72f6ef2e50SAndreas Gohr foreach ($chunks as $chunk) { 73f6ef2e50SAndreas Gohr $this->db->saveRecord('embeddings', [ 74f6ef2e50SAndreas Gohr 'page' => $chunk->getPage(), 75f6ef2e50SAndreas Gohr 'id' => $chunk->getId(), 76f6ef2e50SAndreas Gohr 'chunk' => $chunk->getText(), 77f6ef2e50SAndreas Gohr 'embedding' => json_encode($chunk->getEmbedding()), 78f6ef2e50SAndreas Gohr 'created' => $chunk->getCreated() 79f6ef2e50SAndreas Gohr ]); 80f6ef2e50SAndreas Gohr } 81f6ef2e50SAndreas Gohr } 82f6ef2e50SAndreas Gohr 83f6ef2e50SAndreas Gohr /** @inheritdoc */ 84f6ef2e50SAndreas Gohr public function finalizeCreation() 85f6ef2e50SAndreas Gohr { 86f6ef2e50SAndreas Gohr $this->db->exec('VACUUM'); 87f6ef2e50SAndreas Gohr } 88f6ef2e50SAndreas Gohr 89f6ef2e50SAndreas Gohr /** @inheritdoc */ 9001f06932SAndreas Gohr public function getPageChunks($page, $firstChunkID) 9101f06932SAndreas Gohr { 9201f06932SAndreas Gohr $result = $this->db->queryAll( 9301f06932SAndreas Gohr 'SELECT * FROM embeddings WHERE page = ?', 9401f06932SAndreas Gohr [$page] 9501f06932SAndreas Gohr ); 9601f06932SAndreas Gohr $chunks = []; 9701f06932SAndreas Gohr foreach ($result as $record) { 9801f06932SAndreas Gohr $chunks[] = new Chunk( 9901f06932SAndreas Gohr $record['page'], 10001f06932SAndreas Gohr $record['id'], 10101f06932SAndreas Gohr $record['chunk'], 10201f06932SAndreas Gohr json_decode($record['embedding'], true), 10301f06932SAndreas Gohr $record['created'] 10401f06932SAndreas Gohr ); 10501f06932SAndreas Gohr } 10601f06932SAndreas Gohr return $chunks; 10701f06932SAndreas Gohr } 10801f06932SAndreas Gohr 10901f06932SAndreas Gohr 11001f06932SAndreas Gohr /** @inheritdoc */ 111f6ef2e50SAndreas Gohr public function getSimilarChunks($vector, $limit = 4) 112f6ef2e50SAndreas Gohr { 113f6ef2e50SAndreas Gohr $result = $this->db->queryAll( 114f6ef2e50SAndreas Gohr 'SELECT *, COSIM(?, embedding) AS similarity 115f6ef2e50SAndreas Gohr FROM embeddings 116f6ef2e50SAndreas Gohr WHERE GETACCESSLEVEL(page) > 0 11781b450c8SAndreas Gohr AND similarity > CAST(? AS FLOAT) 118f6ef2e50SAndreas Gohr ORDER BY similarity DESC 119f6ef2e50SAndreas Gohr LIMIT ?', 12081b450c8SAndreas Gohr [json_encode($vector), self::SIMILARITY_THRESHOLD, $limit] 121f6ef2e50SAndreas Gohr ); 122f6ef2e50SAndreas Gohr $chunks = []; 123f6ef2e50SAndreas Gohr foreach ($result as $record) { 124f6ef2e50SAndreas Gohr $chunks[] = new Chunk( 125f6ef2e50SAndreas Gohr $record['page'], 126f6ef2e50SAndreas Gohr $record['id'], 127f6ef2e50SAndreas Gohr $record['chunk'], 128f6ef2e50SAndreas Gohr json_decode($record['embedding'], true), 1299b3d1b36SAndreas Gohr $record['created'], 1309b3d1b36SAndreas Gohr $record['similarity'] 131f6ef2e50SAndreas Gohr ); 132f6ef2e50SAndreas Gohr } 133f6ef2e50SAndreas Gohr return $chunks; 134f6ef2e50SAndreas Gohr } 135f6ef2e50SAndreas Gohr 136f6ef2e50SAndreas Gohr /** @inheritdoc */ 137f6ef2e50SAndreas Gohr public function statistics() 138f6ef2e50SAndreas Gohr { 139f6ef2e50SAndreas Gohr $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings'); 140f6ef2e50SAndreas Gohr $size = $this->db->queryValue( 141f6ef2e50SAndreas Gohr 'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()' 142f6ef2e50SAndreas Gohr ); 143f6ef2e50SAndreas Gohr return [ 144f6ef2e50SAndreas Gohr 'storage type' => 'SQLite', 145f6ef2e50SAndreas Gohr 'chunks' => $items, 146f6ef2e50SAndreas Gohr 'db size' => filesize_h($size) 147f6ef2e50SAndreas Gohr ]; 148f6ef2e50SAndreas Gohr } 149f6ef2e50SAndreas Gohr 150f6ef2e50SAndreas Gohr /** 151f6ef2e50SAndreas Gohr * Method registered as SQLite callback to calculate the cosine similarity 152f6ef2e50SAndreas Gohr * 153f6ef2e50SAndreas Gohr * @param string $query JSON encoded vector array 154f6ef2e50SAndreas Gohr * @param string $embedding JSON encoded vector array 155f6ef2e50SAndreas Gohr * @return float 156f6ef2e50SAndreas Gohr */ 157f6ef2e50SAndreas Gohr public function sqliteCosineSimilarityCallback($query, $embedding) 158f6ef2e50SAndreas Gohr { 159f6ef2e50SAndreas Gohr return (float)$this->cosineSimilarity(json_decode($query), json_decode($embedding)); 160f6ef2e50SAndreas Gohr } 161f6ef2e50SAndreas Gohr 162f6ef2e50SAndreas Gohr /** 163f6ef2e50SAndreas Gohr * Calculate the cosine similarity between two vectors 164f6ef2e50SAndreas Gohr * 165*35555bacSAndreas Gohr * Actually just calculating the dot product of the two vectors, since they are normalized 166*35555bacSAndreas Gohr * 167*35555bacSAndreas Gohr * @param float[] $queryVector The normalized vector of the search phrase 168*35555bacSAndreas Gohr * @param float[] $embedding The normalized vector of the chunk 169f6ef2e50SAndreas Gohr * @return float 170f6ef2e50SAndreas Gohr */ 171f6ef2e50SAndreas Gohr protected function cosineSimilarity($queryVector, $embedding) 172f6ef2e50SAndreas Gohr { 173f6ef2e50SAndreas Gohr $dotProduct = 0; 174f6ef2e50SAndreas Gohr foreach ($queryVector as $key => $value) { 175f6ef2e50SAndreas Gohr $dotProduct += $value * $embedding[$key]; 176f6ef2e50SAndreas Gohr } 177*35555bacSAndreas Gohr return $dotProduct; 178f6ef2e50SAndreas Gohr } 179f6ef2e50SAndreas Gohr} 180