xref: /plugin/aichat/Storage/SQLiteStorage.php (revision ab1f8dde36106432cc0a6f320220da5fae6971fe)
17ebc7895Ssplitbrain<?php
27ebc7895Ssplitbrain
37ebc7895Ssplitbrain/** @noinspection SqlResolve */
4f6ef2e50SAndreas Gohr
5f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
6f6ef2e50SAndreas Gohr
7e33a1d7aSAndreas Gohruse dokuwiki\plugin\aichat\AIChat;
8f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
9f6ef2e50SAndreas Gohruse dokuwiki\plugin\sqlite\SQLiteDB;
103379af09SAndreas Gohruse KMeans\Cluster;
113379af09SAndreas Gohruse KMeans\Space;
12f6ef2e50SAndreas Gohr
13f6ef2e50SAndreas Gohr/**
14f6ef2e50SAndreas Gohr * Implements the storage backend using a SQLite database
1535555bacSAndreas Gohr *
1635555bacSAndreas Gohr * Note: all embeddings are stored and returned as normalized vectors
17f6ef2e50SAndreas Gohr */
18f6ef2e50SAndreas Gohrclass SQLiteStorage extends AbstractStorage
19f6ef2e50SAndreas Gohr{
2081b450c8SAndreas Gohr
213379af09SAndreas Gohr    /** @var int Number of documents to randomly sample to create the clusters */
2230b9cbc7Ssplitbrain    final public const SAMPLE_SIZE = 2000;
233379af09SAndreas Gohr    /** @var int The average size of each cluster */
2430b9cbc7Ssplitbrain    final public const CLUSTER_SIZE = 400;
253379af09SAndreas Gohr
26f6ef2e50SAndreas Gohr    /** @var SQLiteDB */
27f6ef2e50SAndreas Gohr    protected $db;
28f6ef2e50SAndreas Gohr
29e33a1d7aSAndreas Gohr    protected $useLanguageClusters = false;
30e33a1d7aSAndreas Gohr
31720bb43fSAndreas Gohr    /** @var float minimum similarity to consider a chunk a match */
32720bb43fSAndreas Gohr    protected $similarityThreshold = 0;
33720bb43fSAndreas Gohr
3404afb84fSAndreas Gohr    /** @inheritdoc */
3504afb84fSAndreas Gohr    public function __construct(array $config)
36f6ef2e50SAndreas Gohr    {
37f6ef2e50SAndreas Gohr        $this->db = new SQLiteDB('aichat', DOKU_PLUGIN . 'aichat/db/');
3830b9cbc7Ssplitbrain        $this->db->getPdo()->sqliteCreateFunction('COSIM', $this->sqliteCosineSimilarityCallback(...), 2);
39e33a1d7aSAndreas Gohr
40e33a1d7aSAndreas Gohr        $helper = plugin_load('helper', 'aichat');
41e33a1d7aSAndreas Gohr        $this->useLanguageClusters = $helper->getConf('preferUIlanguage') >= AIChat::LANG_UI_LIMITED;
42720bb43fSAndreas Gohr
43720bb43fSAndreas Gohr        $this->similarityThreshold = $config['similarityThreshold']/100;
44f6ef2e50SAndreas Gohr    }
45f6ef2e50SAndreas Gohr
46f6ef2e50SAndreas Gohr    /** @inheritdoc */
47f6ef2e50SAndreas Gohr    public function getChunk($chunkID)
48f6ef2e50SAndreas Gohr    {
49f6ef2e50SAndreas Gohr        $record = $this->db->queryRecord('SELECT * FROM embeddings WHERE id = ?', [$chunkID]);
50f6ef2e50SAndreas Gohr        if (!$record) return null;
51f6ef2e50SAndreas Gohr
52f6ef2e50SAndreas Gohr        return new Chunk(
53f6ef2e50SAndreas Gohr            $record['page'],
54f6ef2e50SAndreas Gohr            $record['id'],
55f6ef2e50SAndreas Gohr            $record['chunk'],
5630b9cbc7Ssplitbrain            json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR),
57e33a1d7aSAndreas Gohr            $record['lang'],
58f6ef2e50SAndreas Gohr            $record['created']
59f6ef2e50SAndreas Gohr        );
60f6ef2e50SAndreas Gohr    }
61f6ef2e50SAndreas Gohr
62f6ef2e50SAndreas Gohr    /** @inheritdoc */
63f6ef2e50SAndreas Gohr    public function startCreation($clear = false)
64f6ef2e50SAndreas Gohr    {
65f6ef2e50SAndreas Gohr        if ($clear) {
66f6ef2e50SAndreas Gohr            /** @noinspection SqlWithoutWhere */
67f6ef2e50SAndreas Gohr            $this->db->exec('DELETE FROM embeddings');
68*ab1f8ddeSAndreas Gohr            /** @noinspection SqlWithoutWhere */
69*ab1f8ddeSAndreas Gohr            $this->db->exec('DELETE FROM clusters');
70f6ef2e50SAndreas Gohr        }
71f6ef2e50SAndreas Gohr    }
72f6ef2e50SAndreas Gohr
73f6ef2e50SAndreas Gohr    /** @inheritdoc */
74f6ef2e50SAndreas Gohr    public function reusePageChunks($page, $firstChunkID)
75f6ef2e50SAndreas Gohr    {
76f6ef2e50SAndreas Gohr        // no-op
77f6ef2e50SAndreas Gohr    }
78f6ef2e50SAndreas Gohr
79f6ef2e50SAndreas Gohr    /** @inheritdoc */
80f6ef2e50SAndreas Gohr    public function deletePageChunks($page, $firstChunkID)
81f6ef2e50SAndreas Gohr    {
82f6ef2e50SAndreas Gohr        $this->db->exec('DELETE FROM embeddings WHERE page = ?', [$page]);
83f6ef2e50SAndreas Gohr    }
84f6ef2e50SAndreas Gohr
85f6ef2e50SAndreas Gohr    /** @inheritdoc */
86f6ef2e50SAndreas Gohr    public function addPageChunks($chunks)
87f6ef2e50SAndreas Gohr    {
88f6ef2e50SAndreas Gohr        foreach ($chunks as $chunk) {
89f6ef2e50SAndreas Gohr            $this->db->saveRecord('embeddings', [
90f6ef2e50SAndreas Gohr                'page' => $chunk->getPage(),
91f6ef2e50SAndreas Gohr                'id' => $chunk->getId(),
92f6ef2e50SAndreas Gohr                'chunk' => $chunk->getText(),
9330b9cbc7Ssplitbrain                'embedding' => json_encode($chunk->getEmbedding(), JSON_THROW_ON_ERROR),
94e33a1d7aSAndreas Gohr                'created' => $chunk->getCreated(),
95e33a1d7aSAndreas Gohr                'lang' => $chunk->getLanguage(),
96f6ef2e50SAndreas Gohr            ]);
97f6ef2e50SAndreas Gohr        }
98f6ef2e50SAndreas Gohr    }
99f6ef2e50SAndreas Gohr
100f6ef2e50SAndreas Gohr    /** @inheritdoc */
101f6ef2e50SAndreas Gohr    public function finalizeCreation()
102f6ef2e50SAndreas Gohr    {
1033379af09SAndreas Gohr        if (!$this->hasClusters()) {
1043379af09SAndreas Gohr            $this->createClusters();
1053379af09SAndreas Gohr        }
1063379af09SAndreas Gohr        $this->setChunkClusters();
1073379af09SAndreas Gohr
108f6ef2e50SAndreas Gohr        $this->db->exec('VACUUM');
109f6ef2e50SAndreas Gohr    }
110f6ef2e50SAndreas Gohr
111f6ef2e50SAndreas Gohr    /** @inheritdoc */
1123379af09SAndreas Gohr    public function runMaintenance()
1133379af09SAndreas Gohr    {
1143379af09SAndreas Gohr        $this->createClusters();
1153379af09SAndreas Gohr        $this->setChunkClusters();
1163379af09SAndreas Gohr    }
1173379af09SAndreas Gohr
1183379af09SAndreas Gohr    /** @inheritdoc */
11901f06932SAndreas Gohr    public function getPageChunks($page, $firstChunkID)
12001f06932SAndreas Gohr    {
12101f06932SAndreas Gohr        $result = $this->db->queryAll(
12201f06932SAndreas Gohr            'SELECT * FROM embeddings WHERE page = ?',
12301f06932SAndreas Gohr            [$page]
12401f06932SAndreas Gohr        );
12501f06932SAndreas Gohr        $chunks = [];
12601f06932SAndreas Gohr        foreach ($result as $record) {
12701f06932SAndreas Gohr            $chunks[] = new Chunk(
12801f06932SAndreas Gohr                $record['page'],
12901f06932SAndreas Gohr                $record['id'],
13001f06932SAndreas Gohr                $record['chunk'],
13130b9cbc7Ssplitbrain                json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR),
132e33a1d7aSAndreas Gohr                $record['lang'],
13301f06932SAndreas Gohr                $record['created']
13401f06932SAndreas Gohr            );
13501f06932SAndreas Gohr        }
13601f06932SAndreas Gohr        return $chunks;
13701f06932SAndreas Gohr    }
13801f06932SAndreas Gohr
13901f06932SAndreas Gohr    /** @inheritdoc */
140e33a1d7aSAndreas Gohr    public function getSimilarChunks($vector, $lang = '', $limit = 4)
141f6ef2e50SAndreas Gohr    {
142e33a1d7aSAndreas Gohr        $cluster = $this->getCluster($vector, $lang);
1438285fff9SAndreas Gohr        if ($this->logger) $this->logger->info(
1447ebc7895Ssplitbrain            'Using cluster {cluster} for similarity search',
1457ebc7895Ssplitbrain            ['cluster' => $cluster]
1468285fff9SAndreas Gohr        );
1473379af09SAndreas Gohr
148f6ef2e50SAndreas Gohr        $result = $this->db->queryAll(
149f6ef2e50SAndreas Gohr            'SELECT *, COSIM(?, embedding) AS similarity
150f6ef2e50SAndreas Gohr               FROM embeddings
1513379af09SAndreas Gohr              WHERE cluster = ?
1523379af09SAndreas Gohr                AND GETACCESSLEVEL(page) > 0
15381b450c8SAndreas Gohr                AND similarity > CAST(? AS FLOAT)
154f6ef2e50SAndreas Gohr           ORDER BY similarity DESC
155f6ef2e50SAndreas Gohr              LIMIT ?',
156720bb43fSAndreas Gohr            [json_encode($vector, JSON_THROW_ON_ERROR), $cluster, $this->similarityThreshold, $limit]
157f6ef2e50SAndreas Gohr        );
158f6ef2e50SAndreas Gohr        $chunks = [];
159f6ef2e50SAndreas Gohr        foreach ($result as $record) {
160f6ef2e50SAndreas Gohr            $chunks[] = new Chunk(
161f6ef2e50SAndreas Gohr                $record['page'],
162f6ef2e50SAndreas Gohr                $record['id'],
163f6ef2e50SAndreas Gohr                $record['chunk'],
16430b9cbc7Ssplitbrain                json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR),
165e33a1d7aSAndreas Gohr                $record['lang'],
1669b3d1b36SAndreas Gohr                $record['created'],
1679b3d1b36SAndreas Gohr                $record['similarity']
168f6ef2e50SAndreas Gohr            );
169f6ef2e50SAndreas Gohr        }
170f6ef2e50SAndreas Gohr        return $chunks;
171f6ef2e50SAndreas Gohr    }
172f6ef2e50SAndreas Gohr
173f6ef2e50SAndreas Gohr    /** @inheritdoc */
174f6ef2e50SAndreas Gohr    public function statistics()
175f6ef2e50SAndreas Gohr    {
176f6ef2e50SAndreas Gohr        $items = $this->db->queryValue('SELECT COUNT(*) FROM embeddings');
177f6ef2e50SAndreas Gohr        $size = $this->db->queryValue(
178f6ef2e50SAndreas Gohr            'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
179f6ef2e50SAndreas Gohr        );
180f8d5ae01SAndreas Gohr        $query = "SELECT cluster || ' ' || lang, COUNT(*) || ' chunks' as cnt
181f8d5ae01SAndreas Gohr                    FROM embeddings
182f8d5ae01SAndreas Gohr                GROUP BY cluster
183f8d5ae01SAndreas Gohr                ORDER BY cluster";
1843379af09SAndreas Gohr        $clusters = $this->db->queryKeyValueList($query);
1853379af09SAndreas Gohr
186f6ef2e50SAndreas Gohr        return [
187f6ef2e50SAndreas Gohr            'storage type' => 'SQLite',
188f6ef2e50SAndreas Gohr            'chunks' => $items,
1893379af09SAndreas Gohr            'db size' => filesize_h($size),
1903379af09SAndreas Gohr            'clusters' => $clusters,
191f6ef2e50SAndreas Gohr        ];
192f6ef2e50SAndreas Gohr    }
193f6ef2e50SAndreas Gohr
194f6ef2e50SAndreas Gohr    /**
195f6ef2e50SAndreas Gohr     * Method registered as SQLite callback to calculate the cosine similarity
196f6ef2e50SAndreas Gohr     *
197f6ef2e50SAndreas Gohr     * @param string $query JSON encoded vector array
198f6ef2e50SAndreas Gohr     * @param string $embedding JSON encoded vector array
199f6ef2e50SAndreas Gohr     * @return float
200f6ef2e50SAndreas Gohr     */
201f6ef2e50SAndreas Gohr    public function sqliteCosineSimilarityCallback($query, $embedding)
202f6ef2e50SAndreas Gohr    {
203441edf84SAndreas Gohr        return (float)$this->cosineSimilarity(
204441edf84SAndreas Gohr            json_decode($query, true, 512, JSON_THROW_ON_ERROR),
205441edf84SAndreas Gohr            json_decode($embedding, true, 512, JSON_THROW_ON_ERROR)
206441edf84SAndreas Gohr        );
207f6ef2e50SAndreas Gohr    }
208f6ef2e50SAndreas Gohr
209f6ef2e50SAndreas Gohr    /**
210f6ef2e50SAndreas Gohr     * Calculate the cosine similarity between two vectors
211f6ef2e50SAndreas Gohr     *
21235555bacSAndreas Gohr     * Actually just calculating the dot product of the two vectors, since they are normalized
21335555bacSAndreas Gohr     *
21435555bacSAndreas Gohr     * @param float[] $queryVector The normalized vector of the search phrase
21535555bacSAndreas Gohr     * @param float[] $embedding The normalized vector of the chunk
216f6ef2e50SAndreas Gohr     * @return float
217f6ef2e50SAndreas Gohr     */
218f6ef2e50SAndreas Gohr    protected function cosineSimilarity($queryVector, $embedding)
219f6ef2e50SAndreas Gohr    {
220f6ef2e50SAndreas Gohr        $dotProduct = 0;
221f6ef2e50SAndreas Gohr        foreach ($queryVector as $key => $value) {
222f6ef2e50SAndreas Gohr            $dotProduct += $value * $embedding[$key];
223f6ef2e50SAndreas Gohr        }
22435555bacSAndreas Gohr        return $dotProduct;
225f6ef2e50SAndreas Gohr    }
2263379af09SAndreas Gohr
2273379af09SAndreas Gohr    /**
2283379af09SAndreas Gohr     * Create new clusters based on random chunks
2293379af09SAndreas Gohr     *
230e33a1d7aSAndreas Gohr     * @return void
2313379af09SAndreas Gohr     */
2323379af09SAndreas Gohr    protected function createClusters()
2333379af09SAndreas Gohr    {
234e33a1d7aSAndreas Gohr        if ($this->useLanguageClusters) {
235e33a1d7aSAndreas Gohr            $result = $this->db->queryAll('SELECT DISTINCT lang FROM embeddings');
236e33a1d7aSAndreas Gohr            $langs = array_column($result, 'lang');
237e33a1d7aSAndreas Gohr            foreach ($langs as $lang) {
238e33a1d7aSAndreas Gohr                $this->createLanguageClusters($lang);
239e33a1d7aSAndreas Gohr            }
240e33a1d7aSAndreas Gohr        } else {
241e33a1d7aSAndreas Gohr            $this->createLanguageClusters('');
242e33a1d7aSAndreas Gohr        }
243e33a1d7aSAndreas Gohr    }
244e33a1d7aSAndreas Gohr
245e33a1d7aSAndreas Gohr    /**
246e33a1d7aSAndreas Gohr     * Create new clusters based on random chunks for the given Language
247e33a1d7aSAndreas Gohr     *
248e33a1d7aSAndreas Gohr     * @param string $lang The language to cluster, empty when all languages go into the same cluster
249e33a1d7aSAndreas Gohr     * @noinspection SqlWithoutWhere
250e33a1d7aSAndreas Gohr     */
251e33a1d7aSAndreas Gohr    protected function createLanguageClusters($lang)
252e33a1d7aSAndreas Gohr    {
253e33a1d7aSAndreas Gohr        if ($lang != '') {
254e33a1d7aSAndreas Gohr            $where = 'WHERE lang = ' . $this->db->getPdo()->quote($lang);
255e33a1d7aSAndreas Gohr        } else {
256e33a1d7aSAndreas Gohr            $where = '';
257e33a1d7aSAndreas Gohr        }
258e33a1d7aSAndreas Gohr
259e33a1d7aSAndreas Gohr        if ($this->logger) $this->logger->info('Creating new {lang} clusters...', ['lang' => $lang]);
2603379af09SAndreas Gohr        $this->db->getPdo()->beginTransaction();
2613379af09SAndreas Gohr        try {
2623379af09SAndreas Gohr            // clean up old cluster data
263e33a1d7aSAndreas Gohr            $query = "DELETE FROM clusters $where";
2643379af09SAndreas Gohr            $this->db->exec($query);
265e33a1d7aSAndreas Gohr            $query = "UPDATE embeddings SET cluster = NULL $where";
2663379af09SAndreas Gohr            $this->db->exec($query);
2673379af09SAndreas Gohr
2683379af09SAndreas Gohr            // get a random selection of chunks
269e33a1d7aSAndreas Gohr            $query = "SELECT id, embedding FROM embeddings $where ORDER BY RANDOM() LIMIT ?";
2703379af09SAndreas Gohr            $result = $this->db->queryAll($query, [self::SAMPLE_SIZE]);
2713379af09SAndreas Gohr            if (!$result) return; // no data to cluster
27230b9cbc7Ssplitbrain            $dimensions = count(json_decode((string) $result[0]['embedding'], true, 512, JSON_THROW_ON_ERROR));
2733379af09SAndreas Gohr
274adfc5429SAndreas Gohr            // how many clusters?
275adfc5429SAndreas Gohr            if (count($result) < self::CLUSTER_SIZE * 3) {
276adfc5429SAndreas Gohr                // there would be less than 3 clusters, so just use one
277adfc5429SAndreas Gohr                $clustercount = 1;
278adfc5429SAndreas Gohr            } else {
2793379af09SAndreas Gohr                // get the number of all chunks, to calculate the number of clusters
280e33a1d7aSAndreas Gohr                $query = "SELECT COUNT(*) FROM embeddings $where";
2813379af09SAndreas Gohr                $total = $this->db->queryValue($query);
2823379af09SAndreas Gohr                $clustercount = ceil($total / self::CLUSTER_SIZE);
283adfc5429SAndreas Gohr            }
2843379af09SAndreas Gohr            if ($this->logger) $this->logger->info('Creating {clusters} clusters', ['clusters' => $clustercount]);
2853379af09SAndreas Gohr
2863379af09SAndreas Gohr            // cluster them using kmeans
2873379af09SAndreas Gohr            $space = new Space($dimensions);
2883379af09SAndreas Gohr            foreach ($result as $record) {
28930b9cbc7Ssplitbrain                $space->addPoint(json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR));
2903379af09SAndreas Gohr            }
2913379af09SAndreas Gohr            $clusters = $space->solve($clustercount, function ($space, $clusters) {
2923379af09SAndreas Gohr                static $iterations = 0;
2933379af09SAndreas Gohr                ++$iterations;
2943379af09SAndreas Gohr                if ($this->logger) {
2957ebc7895Ssplitbrain                    $clustercounts = implode(',', array_map('count', $clusters));
2963379af09SAndreas Gohr                    $this->logger->info('Iteration {iteration}: [{clusters}]', [
2973379af09SAndreas Gohr                        'iteration' => $iterations, 'clusters' => $clustercounts
2983379af09SAndreas Gohr                    ]);
2993379af09SAndreas Gohr                }
3003379af09SAndreas Gohr            }, Cluster::INIT_KMEANS_PLUS_PLUS);
3013379af09SAndreas Gohr
3023379af09SAndreas Gohr            // store the clusters
3037ebc7895Ssplitbrain            foreach ($clusters as $cluster) {
3043379af09SAndreas Gohr                /** @var Cluster $cluster */
3053379af09SAndreas Gohr                $centroid = $cluster->getCoordinates();
306e33a1d7aSAndreas Gohr                $query = 'INSERT INTO clusters (lang, centroid) VALUES (?, ?)';
30730b9cbc7Ssplitbrain                $this->db->exec($query, [$lang, json_encode($centroid, JSON_THROW_ON_ERROR)]);
3083379af09SAndreas Gohr            }
3093379af09SAndreas Gohr
3103379af09SAndreas Gohr            $this->db->getPdo()->commit();
3113379af09SAndreas Gohr            if ($this->logger) $this->logger->success('Created {clusters} clusters', ['clusters' => count($clusters)]);
3123379af09SAndreas Gohr        } catch (\Exception $e) {
3133379af09SAndreas Gohr            $this->db->getPdo()->rollBack();
314e33a1d7aSAndreas Gohr            throw new \RuntimeException('Clustering failed: ' . $e->getMessage(), 0, $e);
3153379af09SAndreas Gohr        }
3163379af09SAndreas Gohr    }
3173379af09SAndreas Gohr
3183379af09SAndreas Gohr    /**
3193379af09SAndreas Gohr     * Assign the nearest cluster for all chunks that don't have one
3203379af09SAndreas Gohr     *
3213379af09SAndreas Gohr     * @return void
3223379af09SAndreas Gohr     */
3233379af09SAndreas Gohr    protected function setChunkClusters()
3243379af09SAndreas Gohr    {
3253379af09SAndreas Gohr        if ($this->logger) $this->logger->info('Assigning clusters to chunks...');
326e33a1d7aSAndreas Gohr        $query = 'SELECT id, embedding, lang FROM embeddings WHERE cluster IS NULL';
3273379af09SAndreas Gohr        $handle = $this->db->query($query);
3283379af09SAndreas Gohr
3293379af09SAndreas Gohr        while ($record = $handle->fetch(\PDO::FETCH_ASSOC)) {
33030b9cbc7Ssplitbrain            $vector = json_decode((string) $record['embedding'], true, 512, JSON_THROW_ON_ERROR);
331e33a1d7aSAndreas Gohr            $cluster = $this->getCluster($vector, $this->useLanguageClusters ? $record['lang'] : '');
3323379af09SAndreas Gohr            $query = 'UPDATE embeddings SET cluster = ? WHERE id = ?';
3333379af09SAndreas Gohr            $this->db->exec($query, [$cluster, $record['id']]);
3343379af09SAndreas Gohr            if ($this->logger) $this->logger->success(
3357ebc7895Ssplitbrain                'Chunk {id} assigned to cluster {cluster}',
3367ebc7895Ssplitbrain                ['id' => $record['id'], 'cluster' => $cluster]
3373379af09SAndreas Gohr            );
3383379af09SAndreas Gohr        }
3393379af09SAndreas Gohr        $handle->closeCursor();
3403379af09SAndreas Gohr    }
3413379af09SAndreas Gohr
3423379af09SAndreas Gohr    /**
3433379af09SAndreas Gohr     * Get the nearest cluster for the given vector
3443379af09SAndreas Gohr     *
3453379af09SAndreas Gohr     * @param float[] $vector
3463379af09SAndreas Gohr     * @return int|null
3473379af09SAndreas Gohr     */
348e33a1d7aSAndreas Gohr    protected function getCluster($vector, $lang)
3493379af09SAndreas Gohr    {
350e33a1d7aSAndreas Gohr        if ($lang != '') {
351e33a1d7aSAndreas Gohr            $where = 'WHERE lang = ' . $this->db->getPdo()->quote($lang);
352e33a1d7aSAndreas Gohr        } else {
353e33a1d7aSAndreas Gohr            $where = '';
354e33a1d7aSAndreas Gohr        }
355e33a1d7aSAndreas Gohr
356e33a1d7aSAndreas Gohr        $query = "SELECT cluster, centroid
357e33a1d7aSAndreas Gohr                    FROM clusters
358e33a1d7aSAndreas Gohr                   $where
359e33a1d7aSAndreas Gohr                ORDER BY COSIM(centroid, ?) DESC
360e33a1d7aSAndreas Gohr                   LIMIT 1";
361e33a1d7aSAndreas Gohr
36230b9cbc7Ssplitbrain        $result = $this->db->queryRecord($query, [json_encode($vector, JSON_THROW_ON_ERROR)]);
3633379af09SAndreas Gohr        if (!$result) return null;
3643379af09SAndreas Gohr        return $result['cluster'];
3653379af09SAndreas Gohr    }
3663379af09SAndreas Gohr
3673379af09SAndreas Gohr    /**
3683379af09SAndreas Gohr     * Check if clustering has been done before
3693379af09SAndreas Gohr     * @return bool
3703379af09SAndreas Gohr     */
3713379af09SAndreas Gohr    protected function hasClusters()
3723379af09SAndreas Gohr    {
3733379af09SAndreas Gohr        $query = 'SELECT COUNT(*) FROM clusters';
3743379af09SAndreas Gohr        return $this->db->queryValue($query) > 0;
3753379af09SAndreas Gohr    }
3768c8b7ba6SAndreas Gohr
3778c8b7ba6SAndreas Gohr    /**
3788c8b7ba6SAndreas Gohr     * Writes TSV files for visualizing with http://projector.tensorflow.org/
3798c8b7ba6SAndreas Gohr     *
3808c8b7ba6SAndreas Gohr     * @param string $vectorfile path to the file with the vectors
3818c8b7ba6SAndreas Gohr     * @param string $metafile path to the file with the metadata
3828c8b7ba6SAndreas Gohr     * @return void
3838c8b7ba6SAndreas Gohr     */
3848c8b7ba6SAndreas Gohr    public function dumpTSV($vectorfile, $metafile)
3858c8b7ba6SAndreas Gohr    {
3868c8b7ba6SAndreas Gohr        $query = 'SELECT * FROM embeddings';
3878c8b7ba6SAndreas Gohr        $handle = $this->db->query($query);
3888c8b7ba6SAndreas Gohr
3898c8b7ba6SAndreas Gohr        $header = implode("\t", ['id', 'page', 'created']);
3908c8b7ba6SAndreas Gohr        file_put_contents($metafile, $header . "\n", FILE_APPEND);
3918c8b7ba6SAndreas Gohr
3928c8b7ba6SAndreas Gohr        while ($row = $handle->fetch(\PDO::FETCH_ASSOC)) {
39330b9cbc7Ssplitbrain            $vector = json_decode((string) $row['embedding'], true, 512, JSON_THROW_ON_ERROR);
3948c8b7ba6SAndreas Gohr            $vector = implode("\t", $vector);
3958c8b7ba6SAndreas Gohr
3968c8b7ba6SAndreas Gohr            $meta = implode("\t", [$row['id'], $row['page'], $row['created']]);
3978c8b7ba6SAndreas Gohr
3988c8b7ba6SAndreas Gohr            file_put_contents($vectorfile, $vector . "\n", FILE_APPEND);
3998c8b7ba6SAndreas Gohr            file_put_contents($metafile, $meta . "\n", FILE_APPEND);
4008c8b7ba6SAndreas Gohr        }
4018c8b7ba6SAndreas Gohr    }
402f6ef2e50SAndreas Gohr}
403