xref: /plugin/aichat/Storage/AbstractStorage.php (revision 7ebc78955c65af90e7ee0afbd07adc15271113ba)
1f6ef2e50SAndreas Gohr<?php
2f6ef2e50SAndreas Gohr
3f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
4f6ef2e50SAndreas Gohr
53379af09SAndreas Gohruse dokuwiki\Extension\CLIPlugin;
6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
7f6ef2e50SAndreas Gohr
/**
 * Defines a vector storage for page chunks and their embeddings
 *
 * Please note that chunkIDs are created outside of the storage. They reference the Page's ID in
 * DokuWiki's fulltext index. ChunkIDs count from the page's id*100 upwards. Eg. Page 12 will have
 * chunks 1200, 1201, 1202, ...
 */
abstract class AbstractStorage
{
    /** @var CLIPlugin $logger The logger handed in via setLogger(), unset until then */
    protected $logger;

    /**
     * Set the logger this storage should use
     *
     * The given object is stored as-is in the protected $logger property.
     *
     * @param CLIPlugin $logger
     * @return void
     */
    public function setLogger($logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the chunk with the given ID
     *
     * @param int $chunkID
     * @return Chunk|null
     */
    abstract public function getChunk($chunkID);

    /**
     * Called when the storage is about to be (re)built
     *
     * Storages may need to open a transaction or prepare other things here.
     *
     * @param bool $clear Should any existing data be thrown away?
     * @return void
     */
    abstract public function startCreation($clear = false);

    /**
     * Called when the storage is (re)built and the existing chunks should be reused
     *
     * Storages that can be updated may simply do nothing here
     *
     * @param string $page The page the chunks belong to
     * @param int $firstChunkID The ID of the first chunk to reuse
     * @return void
     */
    abstract public function reusePageChunks($page, $firstChunkID);

    /**
     * Delete all chunks associated with the given page
     *
     * @param string $page The page the chunks belong to
     * @param int $firstChunkID The ID of the first chunk
     * @return void
     */
    abstract public function deletePageChunks($page, $firstChunkID);

    /**
     * Add the given new Chunks to the storage
     *
     * @param Chunk[] $chunks
     * @return void
     */
    abstract public function addPageChunks($chunks);

    /**
     * All chunks have been added, finalize the storage
     *
     * This is where transactions may be committed and/or memory structures be written to disk.
     *
     * @return void
     */
    abstract public function finalizeCreation();

    /**
     * Run maintenance tasks on the storage
     *
     * Each storage can decide on its own what to do here. Documentation should explain
     * how often this should be run.
     *
     * @return void
     */
    abstract public function runMaintenance();

    /**
     * Get all chunks associated with the given page
     *
     * @param string $page The page the chunks belong to
     * @param int $firstChunkID The ID of the first chunk
     * @return Chunk[]
     */
    abstract public function getPageChunks($page, $firstChunkID);

    /**
     * Get the chunks most similar to the given vector, using a nearest neighbor search
     *
     * The returned chunks should be sorted by similarity, most similar first.
     *
     * If possible in an efficient way, only chunks readable by the current user should be returned (ACL check).
     * If not, the storage should return twice the $limit of chunks and the caller will filter out the readable ones.
     *
     * @param float[] $vector The vector to compare to
     * @param string $lang Limit results to this language. When empty consider all languages
     * @param int $limit The number of results to return, see note above
     * @return Chunk[]
     */
    abstract public function getSimilarChunks($vector, $lang = '', $limit = 4);

    /**
     * Get information about the storage
     *
     * Each storage can decide on its own what to return here as key value pairs. Keys should be self explanatory.
     *
     * @return string[]
     */
    abstract public function statistics();

    /**
     * Writes TSV files for visualizing with http://projector.tensorflow.org/
     *
     * This base implementation writes nothing and always throws; a storage that
     * supports the export has to override this method.
     *
     * @param string $vectorfile path to the file with the vectors
     * @param string $metafile path to the file with the metadata
     * @return void
     * @throws \RuntimeException always, in this base implementation
     */
    public function dumpTSV($vectorfile, $metafile)
    {
        throw new \RuntimeException('Not implemented for current storage');
    }
}
139