<?php

namespace dokuwiki\plugin\aichat\Storage;

use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\sqlite\SQLiteDB;

/**
 * Implements the storage backend using a SQLite database
 *
 * Chunks are kept in the `embeddings` table; each embedding vector is stored as a JSON encoded
 * array. Similarity search is done inside SQLite through the custom COSIM() SQL function that
 * the constructor registers on the PDO connection.
 */
class SQLiteStorage extends AbstractStorage
{
    /** @var SQLiteDB */
    protected $db;

    /**
     * Initializes the database connection and registers our custom function
     *
     * COSIM(query, embedding) is bound to sqliteCosineSimilarityCallback() and takes exactly
     * two arguments.
     *
     * @throws \Exception
     */
    public function __construct()
    {
        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
        $this->db->getPdo()->sqliteCreateFunction('COSIM', [$this, 'sqliteCosineSimilarityCallback'], 2);
    }

    /**
     * Load a single chunk by its ID
     *
     * @inheritdoc
     * @return Chunk|null null when no record with that ID exists
     */
    public function getChunk($chunkID)
    {
        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
        if (!$record) return null;

        return new Chunk(
            $record['page'],
            $record['id'],
            $record['chunk'],
            json_decode($record['embedding'], true),
            $record['created']
        );
    }

    /**
     * Prepare the storage for a (re)build of the index
     *
     * @inheritdoc
     * @param bool $clear when true, all stored embeddings are removed first
     */
    public function startCreation($clear = false)
    {
        if ($clear) {
            /** @noinspection SqlWithoutWhere */
            $this->db->exec('DELETE FROM embeddings');
        }
    }

    /**
     * Nothing to do for this backend: existing rows simply stay in the table
     *
     * @inheritdoc
     */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /**
     * Remove all chunks of the given page
     *
     * Rows are matched by page name only; $firstChunkID is not needed here.
     *
     * @inheritdoc
     */
    public function deletePageChunks($page, $firstChunkID)
    {
        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
    }

    /**
     * Store the given chunks, one record per chunk
     *
     * @inheritdoc
     * @param Chunk[] $chunks
     */
    public function addPageChunks($chunks)
    {
        foreach ($chunks as $chunk) {
            $this->db->saveRecord('embeddings', [
                'page' => $chunk->getPage(),
                'id' => $chunk->getId(),
                'chunk' => $chunk->getText(),
                // vectors are persisted as JSON, COSIM() decodes them again
                'embedding' => json_encode($chunk->getEmbedding()),
                'created' => $chunk->getCreated()
            ]);
        }
    }

    /**
     * Compact the database file once the index has been written
     *
     * @inheritdoc
     */
    public function finalizeCreation()
    {
        $this->db->exec('VACUUM');
    }

    /**
     * Find the chunks most similar to the given vector
     *
     * The ranking is calculated by the COSIM() function inside the query. Rows are filtered
     * through GETACCESSLEVEL(page); that SQL function is not registered in this class.
     * NOTE(review): presumably it is provided by the sqlite plugin and hides pages the
     * current user may not read - confirm.
     *
     * @inheritdoc
     * @param float[] $vector The vector of the search phrase
     * @param int $limit Maximum number of chunks to return
     * @return Chunk[] ordered by descending similarity
     */
    public function getSimilarChunks($vector, $limit = 4)
    {
        $result = $this->db->queryAll(
            'SELECT *, COSIM(?, embedding) AS similarity
               FROM embeddings
              WHERE GETACCESSLEVEL(page) > 0
           ORDER BY similarity DESC
              LIMIT ?',
            [json_encode($vector), $limit]
        );
        $chunks = [];
        foreach ($result as $record) {
            $chunks[] = new Chunk(
                $record['page'],
                $record['id'],
                $record['chunk'],
                json_decode($record['embedding'], true),
                $record['created'],
                $record['similarity']
            );
        }
        return $chunks;
    }

    /**
     * Report the number of stored chunks and the size of the database
     *
     * @inheritdoc
     * @return array with the keys 'storage type', 'chunks' and 'db size'
     */
    public function statistics()
    {
        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
        // size in bytes = number of database pages * bytes per page
        $size = $this->db->queryValue(
            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
        );
        return [
            'storage type' => 'SQLite',
            'chunks' => $items,
            'db size' => filesize_h($size)
        ];
    }

    /**
     * Method registered as SQLite callback to calculate the cosine similarity
     *
     * A value that does not decode to an array (NULL column, broken JSON) yields a similarity
     * of 0.0 instead of raising an error in the middle of the running query.
     *
     * @param string $query JSON encoded vector array
     * @param string $embedding JSON encoded vector array
     * @return float
     */
    public function sqliteCosineSimilarityCallback($query, $embedding)
    {
        // decode to plain arrays, the same way the embeddings are decoded everywhere else
        $queryVector = json_decode((string)$query, true);
        $embeddingVector = json_decode((string)$embedding, true);

        if (!is_array($queryVector) || !is_array($embeddingVector)) return 0.0;

        return (float)$this->cosineSimilarity($queryVector, $embeddingVector);
    }

    /**
     * Calculate the cosine similarity between two vectors
     *
     * Only the components present in $queryVector are taken into account. A component that is
     * missing from $embedding counts as 0. When either vector has no magnitude the similarity
     * is undefined and 0.0 is returned.
     *
     * @param float[] $queryVector The vector of the search phrase
     * @param float[] $embedding The vector of the chunk
     * @return float
     * @link https://doku.wiki/src-cosine-similarity
     */
    protected function cosineSimilarity($queryVector, $embedding)
    {
        $dotProduct = 0;
        $queryEmbeddingLength = 0;
        $embeddingLength = 0;

        foreach ($queryVector as $key => $value) {
            // avoid an "undefined array key" warning on vectors of different size
            $other = isset($embedding[$key]) ? $embedding[$key] : 0;

            $dotProduct += $value * $other;
            $queryEmbeddingLength += $value * $value;
            $embeddingLength += $other * $other;
        }

        $denominator = sqrt($queryEmbeddingLength) * sqrt($embeddingLength);

        // a zero (or NaN) denominator would throw DivisionByZeroError or produce garbage
        if (!($denominator > 0)) return 0.0;

        return $dotProduct / $denominator;
    }
}