17ebc7895Ssplitbrain<?php 27ebc7895Ssplitbrain 37ebc7895Ssplitbrain/** @noinspection SqlResolve */ 4f6ef2e50SAndreas Gohr 5f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage; 6f6ef2e50SAndreas Gohr 7e33a1d7aSAndreas Gohruse dokuwiki\plugin\aichat\AIChat; 8f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk; 9f6ef2e50SAndreas Gohruse dokuwiki\plugin\sqlite\SQLiteDB; 103379af09SAndreas Gohruse KMeans\Cluster; 113379af09SAndreas Gohruse KMeans\Space; 12f6ef2e50SAndreas Gohr 13f6ef2e50SAndreas Gohr/** 14f6ef2e50SAndreas Gohr * Implements the storage backend using a SQLite database 1535555bacSAndreas Gohr * 1635555bacSAndreas Gohr * Note: all embeddings are stored and returned as normalized vectors 17f6ef2e50SAndreas Gohr */ 18f6ef2e50SAndreas Gohrclass SQLiteStorage extends AbstractStorage 19f6ef2e50SAndreas Gohr{ 203379af09SAndreas Gohr /** @var int Number of documents to randomly sample to create the clusters */ 2130b9cbc7Ssplitbrain final public const SAMPLE_SIZE = 2000; 223379af09SAndreas Gohr /** @var int The average size of each cluster */ 2330b9cbc7Ssplitbrain final public const CLUSTER_SIZE = 400; 243379af09SAndreas Gohr 25f6ef2e50SAndreas Gohr /** @var SQLiteDB */ 26f6ef2e50SAndreas Gohr protected $db; 27f6ef2e50SAndreas Gohr 28e33a1d7aSAndreas Gohr protected $useLanguageClusters = false; 29e33a1d7aSAndreas Gohr 30720bb43fSAndreas Gohr /** @var float minimum similarity to consider a chunk a match */ 31720bb43fSAndreas Gohr protected $similarityThreshold = 0; 32720bb43fSAndreas Gohr 3304afb84fSAndreas Gohr /** @inheritdoc */ 3404afb84fSAndreas Gohr public function __construct(array $config) 35f6ef2e50SAndreas Gohr { 36f6ef2e50SAndreas Gohr $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/'); 3730b9cbc7Ssplitbrain $this->db->getPdo()->sqliteCreateFunction('COSIM', $this->sqliteCosineSimilarityCallback(...), 2); 38e33a1d7aSAndreas Gohr 39e33a1d7aSAndreas Gohr $helper = plugin_load('helper', 'aichat'); 40e33a1d7aSAndreas Gohr $this->useLanguageClusters = $helper->getConf('preferUIlanguage') >= AIChat::LANG_UI_LIMITED; 41720bb43fSAndreas Gohr 42720bb43fSAndreas Gohr $this->similarityThreshold = $config['similarityThreshold'] / 100; 43f6ef2e50SAndreas Gohr } 44f6ef2e50SAndreas Gohr 45f6ef2e50SAndreas Gohr /** @inheritdoc */ 46f6ef2e50SAndreas Gohr public function getChunk($chunkID) 47f6ef2e50SAndreas Gohr { 48f6ef2e50SAndreas Gohr $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]); 49f6ef2e50SAndreas Gohr if (!$record) return null; 50f6ef2e50SAndreas Gohr 51f6ef2e50SAndreas Gohr return new Chunk( 52f6ef2e50SAndreas Gohr $record['page'], 53f6ef2e50SAndreas Gohr $record['id'], 54f6ef2e50SAndreas Gohr $record['chunk'], 5530b9cbc7Ssplitbrain json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR), 56e33a1d7aSAndreas Gohr $record['lang'], 57f6ef2e50SAndreas Gohr $record['created'] 58f6ef2e50SAndreas Gohr ); 59f6ef2e50SAndreas Gohr } 60f6ef2e50SAndreas Gohr 61f6ef2e50SAndreas Gohr /** @inheritdoc */ 62f6ef2e50SAndreas Gohr public function startCreation($clear = false) 63f6ef2e50SAndreas Gohr { 64f6ef2e50SAndreas Gohr if ($clear) { 65f6ef2e50SAndreas Gohr /** @noinspection SqlWithoutWhere */ 66f6ef2e50SAndreas Gohr $this->db->exec('DELETE FROM embeddings'); 67ab1f8ddeSAndreas Gohr /** @noinspection SqlWithoutWhere */ 68ab1f8ddeSAndreas Gohr $this->db->exec('DELETE FROM clusters'); 69f6ef2e50SAndreas Gohr } 70f6ef2e50SAndreas Gohr } 71f6ef2e50SAndreas Gohr 72f6ef2e50SAndreas Gohr /** @inheritdoc */ 73f6ef2e50SAndreas Gohr public function reusePageChunks($page, $firstChunkID) 74f6ef2e50SAndreas Gohr { 75f6ef2e50SAndreas Gohr // no-op 76f6ef2e50SAndreas Gohr } 77f6ef2e50SAndreas Gohr 78f6ef2e50SAndreas Gohr /** @inheritdoc */ 79f6ef2e50SAndreas Gohr public function deletePageChunks($page, $firstChunkID) 80f6ef2e50SAndreas Gohr { 81f6ef2e50SAndreas Gohr $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]); 82f6ef2e50SAndreas Gohr } 83f6ef2e50SAndreas Gohr 84f6ef2e50SAndreas Gohr /** @inheritdoc */ 85f6ef2e50SAndreas Gohr public function addPageChunks($chunks) 86f6ef2e50SAndreas Gohr { 87f6ef2e50SAndreas Gohr foreach ($chunks as $chunk) { 88f6ef2e50SAndreas Gohr $this->db->saveRecord('embeddings', [ 89f6ef2e50SAndreas Gohr 'page' => $chunk->getPage(), 90f6ef2e50SAndreas Gohr 'id' => $chunk->getId(), 91f6ef2e50SAndreas Gohr 'chunk' => $chunk->getText(), 9230b9cbc7Ssplitbrain 'embedding' => json_encode($chunk->getEmbedding(), JSON_THROW_ON_ERROR), 93e33a1d7aSAndreas Gohr 'created' => $chunk->getCreated(), 94e33a1d7aSAndreas Gohr 'lang' => $chunk->getLanguage(), 95f6ef2e50SAndreas Gohr ]); 96f6ef2e50SAndreas Gohr } 97f6ef2e50SAndreas Gohr } 98f6ef2e50SAndreas Gohr 99f6ef2e50SAndreas Gohr /** @inheritdoc */ 100f6ef2e50SAndreas Gohr public function finalizeCreation() 101f6ef2e50SAndreas Gohr { 1023379af09SAndreas Gohr if (!$this->hasClusters()) { 1033379af09SAndreas Gohr $this->createClusters(); 1043379af09SAndreas Gohr } 1053379af09SAndreas Gohr $this->setChunkClusters(); 1063379af09SAndreas Gohr 107f6ef2e50SAndreas Gohr $this->db->exec('VACUUM'); 108f6ef2e50SAndreas Gohr } 109f6ef2e50SAndreas Gohr 110f6ef2e50SAndreas Gohr /** @inheritdoc */ 1113379af09SAndreas Gohr public function runMaintenance() 1123379af09SAndreas Gohr { 1133379af09SAndreas Gohr $this->createClusters(); 1143379af09SAndreas Gohr $this->setChunkClusters(); 1153379af09SAndreas Gohr } 1163379af09SAndreas Gohr 1173379af09SAndreas Gohr /** @inheritdoc */ 11801f06932SAndreas Gohr public function getPageChunks($page, $firstChunkID) 11901f06932SAndreas Gohr { 12001f06932SAndreas Gohr $result = $this->db->queryAll( 12101f06932SAndreas Gohr 'SELECT * FROM embeddings WHERE page = ?', 12201f06932SAndreas Gohr [$page] 12301f06932SAndreas Gohr ); 12401f06932SAndreas Gohr $chunks = []; 12501f06932SAndreas Gohr foreach ($result as $record) { 12601f06932SAndreas Gohr $chunks[] = new Chunk( 12701f06932SAndreas Gohr $record['page'], 12801f06932SAndreas Gohr $record['id'], 12901f06932SAndreas Gohr $record['chunk'], 13030b9cbc7Ssplitbrain json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR), 131e33a1d7aSAndreas Gohr $record['lang'], 13201f06932SAndreas Gohr $record['created'] 13301f06932SAndreas Gohr ); 13401f06932SAndreas Gohr } 13501f06932SAndreas Gohr return $chunks; 13601f06932SAndreas Gohr } 13701f06932SAndreas Gohr 13801f06932SAndreas Gohr /** @inheritdoc */ 139e33a1d7aSAndreas Gohr public function getSimilarChunks($vector, $lang = '', $limit = 4) 140f6ef2e50SAndreas Gohr { 141e33a1d7aSAndreas Gohr $cluster = $this->getCluster($vector, $lang); 1428285fff9SAndreas Gohr if ($this->logger) $this->logger->info( 1437ebc7895Ssplitbrain 'Using cluster {cluster} for similarity search', 1447ebc7895Ssplitbrain ['cluster' => $cluster] 1458285fff9SAndreas Gohr ); 1463379af09SAndreas Gohr 147f6ef2e50SAndreas Gohr $result = $this->db->queryAll( 148f6ef2e50SAndreas Gohr 'SELECT *, COSIM(?, embedding) AS similarity 149f6ef2e50SAndreas Gohr FROM embeddings 1503379af09SAndreas Gohr WHERE cluster = ? 1513379af09SAndreas Gohr AND GETACCESSLEVEL(page) > 0 15281b450c8SAndreas Gohr AND similarity > CAST(? AS FLOAT) 153f6ef2e50SAndreas Gohr ORDER BY similarity DESC 154f6ef2e50SAndreas Gohr LIMIT ?', 155720bb43fSAndreas Gohr [json_encode($vector, JSON_THROW_ON_ERROR), $cluster, $this->similarityThreshold, $limit] 156f6ef2e50SAndreas Gohr ); 157f6ef2e50SAndreas Gohr $chunks = []; 158f6ef2e50SAndreas Gohr foreach ($result as $record) { 159f6ef2e50SAndreas Gohr $chunks[] = new Chunk( 160f6ef2e50SAndreas Gohr $record['page'], 161f6ef2e50SAndreas Gohr $record['id'], 162f6ef2e50SAndreas Gohr $record['chunk'], 16330b9cbc7Ssplitbrain json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR), 164e33a1d7aSAndreas Gohr $record['lang'], 1659b3d1b36SAndreas Gohr $record['created'], 1669b3d1b36SAndreas Gohr $record['similarity'] 167f6ef2e50SAndreas Gohr ); 168f6ef2e50SAndreas Gohr } 169f6ef2e50SAndreas Gohr return $chunks; 170f6ef2e50SAndreas Gohr } 171f6ef2e50SAndreas Gohr 172f6ef2e50SAndreas Gohr /** @inheritdoc */ 173f6ef2e50SAndreas Gohr public function statistics() 174f6ef2e50SAndreas Gohr { 175f6ef2e50SAndreas Gohr $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings'); 176f6ef2e50SAndreas Gohr $size = $this->db->queryValue( 177f6ef2e50SAndreas Gohr 'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()' 178f6ef2e50SAndreas Gohr ); 179f8d5ae01SAndreas Gohr $query = "SELECT cluster || ' ' || lang, COUNT(*) || ' chunks' as cnt 180f8d5ae01SAndreas Gohr FROM embeddings 181f8d5ae01SAndreas Gohr GROUP BY cluster 182f8d5ae01SAndreas Gohr ORDER BY cluster"; 1833379af09SAndreas Gohr $clusters = $this->db->queryKeyValueList($query); 1843379af09SAndreas Gohr 185f6ef2e50SAndreas Gohr return [ 186f6ef2e50SAndreas Gohr 'storage type' => 'SQLite', 187f6ef2e50SAndreas Gohr 'chunks' => $items, 1883379af09SAndreas Gohr 'db size' => filesize_h($size), 1893379af09SAndreas Gohr 'clusters' => $clusters, 190f6ef2e50SAndreas Gohr ]; 191f6ef2e50SAndreas Gohr } 192f6ef2e50SAndreas Gohr 193f6ef2e50SAndreas Gohr /** 194f6ef2e50SAndreas Gohr * Method registered as SQLite callback to calculate the cosine similarity 195f6ef2e50SAndreas Gohr * 196f6ef2e50SAndreas Gohr * @param string $query JSON encoded vector array 197f6ef2e50SAndreas Gohr * @param string $embedding JSON encoded vector array 198f6ef2e50SAndreas Gohr * @return float 199f6ef2e50SAndreas Gohr */ 200f6ef2e50SAndreas Gohr public function sqliteCosineSimilarityCallback($query, $embedding) 201f6ef2e50SAndreas Gohr { 202441edf84SAndreas Gohr return (float)$this->cosineSimilarity( 203441edf84SAndreas Gohr json_decode($query, true, 512, JSON_THROW_ON_ERROR), 204441edf84SAndreas Gohr json_decode($embedding, true, 512, JSON_THROW_ON_ERROR) 205441edf84SAndreas Gohr ); 206f6ef2e50SAndreas Gohr } 207f6ef2e50SAndreas Gohr 208f6ef2e50SAndreas Gohr /** 209f6ef2e50SAndreas Gohr * Calculate the cosine similarity between two vectors 210f6ef2e50SAndreas Gohr * 21135555bacSAndreas Gohr * Actually just calculating the dot product of the two vectors, since they are normalized 21235555bacSAndreas Gohr * 21335555bacSAndreas Gohr * @param float[] $queryVector The normalized vector of the search phrase 21435555bacSAndreas Gohr * @param float[] $embedding The normalized vector of the chunk 215f6ef2e50SAndreas Gohr * @return float 216f6ef2e50SAndreas Gohr */ 217f6ef2e50SAndreas Gohr protected function cosineSimilarity($queryVector, $embedding) 218f6ef2e50SAndreas Gohr { 219f6ef2e50SAndreas Gohr $dotProduct = 0; 220f6ef2e50SAndreas Gohr foreach ($queryVector as $key => $value) { 221*31a78876SAndreas Gohr if(!isset($embedding[$key])) break; // if the vector is shorter than the query, stop. 222f6ef2e50SAndreas Gohr $dotProduct += $value * $embedding[$key]; 223f6ef2e50SAndreas Gohr } 22435555bacSAndreas Gohr return $dotProduct; 225f6ef2e50SAndreas Gohr } 2263379af09SAndreas Gohr 2273379af09SAndreas Gohr /** 2283379af09SAndreas Gohr * Create new clusters based on random chunks 2293379af09SAndreas Gohr * 230e33a1d7aSAndreas Gohr * @return void 2313379af09SAndreas Gohr */ 2323379af09SAndreas Gohr protected function createClusters() 2333379af09SAndreas Gohr { 234e33a1d7aSAndreas Gohr if ($this->useLanguageClusters) { 235e33a1d7aSAndreas Gohr $result = $this->db->queryAll('SELECT DISTINCT lang FROM embeddings'); 236e33a1d7aSAndreas Gohr $langs = array_column($result, 'lang'); 237e33a1d7aSAndreas Gohr foreach ($langs as $lang) { 238e33a1d7aSAndreas Gohr $this->createLanguageClusters($lang); 239e33a1d7aSAndreas Gohr } 240e33a1d7aSAndreas Gohr } else { 241e33a1d7aSAndreas Gohr $this->createLanguageClusters(''); 242e33a1d7aSAndreas Gohr } 243e33a1d7aSAndreas Gohr } 244e33a1d7aSAndreas Gohr 245e33a1d7aSAndreas Gohr /** 246e33a1d7aSAndreas Gohr * Create new clusters based on random chunks for the given Language 247e33a1d7aSAndreas Gohr * 248e33a1d7aSAndreas Gohr * @param string $lang The language to cluster, empty when all languages go into the same cluster 249e33a1d7aSAndreas Gohr * @noinspection SqlWithoutWhere 250e33a1d7aSAndreas Gohr */ 251e33a1d7aSAndreas Gohr protected function createLanguageClusters($lang) 252e33a1d7aSAndreas Gohr { 253e33a1d7aSAndreas Gohr if ($lang != '') { 254e33a1d7aSAndreas Gohr $where = 'WHERE lang = ' . $this->db->getPdo()->quote($lang); 255e33a1d7aSAndreas Gohr } else { 256e33a1d7aSAndreas Gohr $where = ''; 257e33a1d7aSAndreas Gohr } 258e33a1d7aSAndreas Gohr 259e33a1d7aSAndreas Gohr if ($this->logger) $this->logger->info('Creating new {lang} clusters...', ['lang' => $lang]); 2603379af09SAndreas Gohr $this->db->getPdo()->beginTransaction(); 2613379af09SAndreas Gohr try { 2623379af09SAndreas Gohr // clean up old cluster data 263e33a1d7aSAndreas Gohr $query = "DELETE FROM clusters $where"; 2643379af09SAndreas Gohr $this->db->exec($query); 265e33a1d7aSAndreas Gohr $query = "UPDATE embeddings SET cluster = NULL $where"; 2663379af09SAndreas Gohr $this->db->exec($query); 2673379af09SAndreas Gohr 2683379af09SAndreas Gohr // get a random selection of chunks 269e33a1d7aSAndreas Gohr $query = "SELECT id, embedding FROM embeddings $where ORDER BY RANDOM() LIMIT ?"; 2703379af09SAndreas Gohr $result = $this->db->queryAll($query, [self::SAMPLE_SIZE]); 2713379af09SAndreas Gohr if (!$result) return; // no data to cluster 27230b9cbc7Ssplitbrain $dimensions = count(json_decode((string) $result[0]['embedding'], true, 512, JSON_THROW_ON_ERROR)); 2733379af09SAndreas Gohr 274adfc5429SAndreas Gohr // how many clusters? 275adfc5429SAndreas Gohr if (count($result) < self::CLUSTER_SIZE * 3) { 276adfc5429SAndreas Gohr // there would be less than 3 clusters, so just use one 277adfc5429SAndreas Gohr $clustercount = 1; 278adfc5429SAndreas Gohr } else { 2793379af09SAndreas Gohr // get the number of all chunks, to calculate the number of clusters 280e33a1d7aSAndreas Gohr $query = "SELECT COUNT(*) FROM embeddings $where"; 2813379af09SAndreas Gohr $total = $this->db->queryValue($query); 2823379af09SAndreas Gohr $clustercount = ceil($total / self::CLUSTER_SIZE); 283adfc5429SAndreas Gohr } 2843379af09SAndreas Gohr if ($this->logger) $this->logger->info('Creating {clusters} clusters', ['clusters' => $clustercount]); 2853379af09SAndreas Gohr 2863379af09SAndreas Gohr // cluster them using kmeans 2873379af09SAndreas Gohr $space = new Space($dimensions); 2883379af09SAndreas Gohr foreach ($result as $record) { 28930b9cbc7Ssplitbrain $space->addPoint(json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR)); 2903379af09SAndreas Gohr } 2913379af09SAndreas Gohr $clusters = $space->solve($clustercount, function ($space, $clusters) { 2923379af09SAndreas Gohr static $iterations = 0; 2933379af09SAndreas Gohr ++$iterations; 2943379af09SAndreas Gohr if ($this->logger) { 2957ebc7895Ssplitbrain $clustercounts = implode(',', array_map('count', $clusters)); 2963379af09SAndreas Gohr $this->logger->info('Iteration {iteration}: [{clusters}]', [ 2973379af09SAndreas Gohr 'iteration' => $iterations, 'clusters' => $clustercounts 2983379af09SAndreas Gohr ]); 2993379af09SAndreas Gohr } 3003379af09SAndreas Gohr }, Cluster::INIT_KMEANS_PLUS_PLUS); 3013379af09SAndreas Gohr 3023379af09SAndreas Gohr // store the clusters 3037ebc7895Ssplitbrain foreach ($clusters as $cluster) { 3043379af09SAndreas Gohr /** @var Cluster $cluster */ 3053379af09SAndreas Gohr $centroid = $cluster->getCoordinates(); 306e33a1d7aSAndreas Gohr $query = 'INSERT INTO clusters (lang, centroid) VALUES (?, ?)'; 30730b9cbc7Ssplitbrain $this->db->exec($query, [$lang, json_encode($centroid, JSON_THROW_ON_ERROR)]); 3083379af09SAndreas Gohr } 3093379af09SAndreas Gohr 3103379af09SAndreas Gohr $this->db->getPdo()->commit(); 3113379af09SAndreas Gohr if ($this->logger) $this->logger->success('Created {clusters} clusters', ['clusters' => count($clusters)]); 3123379af09SAndreas Gohr } catch (\Exception $e) { 3133379af09SAndreas Gohr $this->db->getPdo()->rollBack(); 31442b2c6e8SAndreas Gohr throw new \RuntimeException('Clustering failed: ' . $e->getMessage(), 4005, $e); 3153379af09SAndreas Gohr } 3163379af09SAndreas Gohr } 3173379af09SAndreas Gohr 3183379af09SAndreas Gohr /** 3193379af09SAndreas Gohr * Assign the nearest cluster for all chunks that don't have one 3203379af09SAndreas Gohr * 3213379af09SAndreas Gohr * @return void 3223379af09SAndreas Gohr */ 3233379af09SAndreas Gohr protected function setChunkClusters() 3243379af09SAndreas Gohr { 3253379af09SAndreas Gohr if ($this->logger) $this->logger->info('Assigning clusters to chunks...'); 326e33a1d7aSAndreas Gohr $query = 'SELECT id, embedding, lang FROM embeddings WHERE cluster IS NULL'; 3273379af09SAndreas Gohr $handle = $this->db->query($query); 3283379af09SAndreas Gohr 3293379af09SAndreas Gohr while ($record = $handle->fetch(\PDO::FETCH_ASSOC)) { 33030b9cbc7Ssplitbrain $vector = json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR); 331e33a1d7aSAndreas Gohr $cluster = $this->getCluster($vector, $this->useLanguageClusters ? $record['lang'] : ''); 3323379af09SAndreas Gohr $query = 'UPDATE embeddings SET cluster = ? WHERE id = ?'; 3333379af09SAndreas Gohr $this->db->exec($query, [$cluster, $record['id']]); 3343379af09SAndreas Gohr if ($this->logger) $this->logger->success( 3357ebc7895Ssplitbrain 'Chunk {id} assigned to cluster {cluster}', 3367ebc7895Ssplitbrain ['id' => $record['id'], 'cluster' => $cluster] 3373379af09SAndreas Gohr ); 3383379af09SAndreas Gohr } 3393379af09SAndreas Gohr $handle->closeCursor(); 3403379af09SAndreas Gohr } 3413379af09SAndreas Gohr 3423379af09SAndreas Gohr /** 3433379af09SAndreas Gohr * Get the nearest cluster for the given vector 3443379af09SAndreas Gohr * 3453379af09SAndreas Gohr * @param float[] $vector 3463379af09SAndreas Gohr * @return int|null 3473379af09SAndreas Gohr */ 348e33a1d7aSAndreas Gohr protected function getCluster($vector, $lang) 3493379af09SAndreas Gohr { 350e33a1d7aSAndreas Gohr if ($lang != '') { 351e33a1d7aSAndreas Gohr $where = 'WHERE lang = ' . $this->db->getPdo()->quote($lang); 352e33a1d7aSAndreas Gohr } else { 353e33a1d7aSAndreas Gohr $where = ''; 354e33a1d7aSAndreas Gohr } 355e33a1d7aSAndreas Gohr 356e33a1d7aSAndreas Gohr $query = "SELECT cluster, centroid 357e33a1d7aSAndreas Gohr FROM clusters 358e33a1d7aSAndreas Gohr $where 359e33a1d7aSAndreas Gohr ORDER BY COSIM(centroid, ?) DESC 360e33a1d7aSAndreas Gohr LIMIT 1"; 361e33a1d7aSAndreas Gohr 36230b9cbc7Ssplitbrain $result = $this->db->queryRecord($query, [json_encode($vector, JSON_THROW_ON_ERROR)]); 3633379af09SAndreas Gohr if (!$result) return null; 3643379af09SAndreas Gohr return $result['cluster']; 3653379af09SAndreas Gohr } 3663379af09SAndreas Gohr 3673379af09SAndreas Gohr /** 3683379af09SAndreas Gohr * Check if clustering has been done before 3693379af09SAndreas Gohr * @return bool 3703379af09SAndreas Gohr */ 3713379af09SAndreas Gohr protected function hasClusters() 3723379af09SAndreas Gohr { 3733379af09SAndreas Gohr $query = 'SELECT COUNT(*) FROM clusters'; 3743379af09SAndreas Gohr return $this->db->queryValue($query) > 0; 3753379af09SAndreas Gohr } 3768c8b7ba6SAndreas Gohr 3778c8b7ba6SAndreas Gohr /** 3788c8b7ba6SAndreas Gohr * Writes TSV files for visualizing with http://projector.tensorflow.org/ 3798c8b7ba6SAndreas Gohr * 3808c8b7ba6SAndreas Gohr * @param string $vectorfile path to the file with the vectors 3818c8b7ba6SAndreas Gohr * @param string $metafile path to the file with the metadata 3828c8b7ba6SAndreas Gohr * @return void 3838c8b7ba6SAndreas Gohr */ 3848c8b7ba6SAndreas Gohr public function dumpTSV($vectorfile, $metafile) 3858c8b7ba6SAndreas Gohr { 3868c8b7ba6SAndreas Gohr $query = 'SELECT * FROM embeddings'; 3878c8b7ba6SAndreas Gohr $handle = $this->db->query($query); 3888c8b7ba6SAndreas Gohr 3898c8b7ba6SAndreas Gohr $header = implode("\t", ['id', 'page', 'created']); 3908c8b7ba6SAndreas Gohr file_put_contents($metafile, $header . "\n", FILE_APPEND); 3918c8b7ba6SAndreas Gohr 3928c8b7ba6SAndreas Gohr while ($row = $handle->fetch(\PDO::FETCH_ASSOC)) { 39330b9cbc7Ssplitbrain $vector = json_decode((string) $row['embedding'], true, 512, JSON_THROW_ON_ERROR); 3948c8b7ba6SAndreas Gohr $vector = implode("\t", $vector); 3958c8b7ba6SAndreas Gohr 3968c8b7ba6SAndreas Gohr $meta = implode("\t", [$row['id'], $row['page'], $row['created']]); 3978c8b7ba6SAndreas Gohr 3988c8b7ba6SAndreas Gohr file_put_contents($vectorfile, $vector . "\n", FILE_APPEND); 3998c8b7ba6SAndreas Gohr file_put_contents($metafile, $meta . "\n", FILE_APPEND); 4008c8b7ba6SAndreas Gohr } 4018c8b7ba6SAndreas Gohr } 402f6ef2e50SAndreas Gohr} 403