xref: /plugin/aichat/Storage/AbstractStorage.php (revision 04afb84f6cb8a0c9b1d4d807e18f90fe739ec371)
1f6ef2e50SAndreas Gohr<?php
2f6ef2e50SAndreas Gohr
3f6ef2e50SAndreas Gohrnamespace dokuwiki\plugin\aichat\Storage;
4f6ef2e50SAndreas Gohr
53379af09SAndreas Gohruse dokuwiki\Extension\CLIPlugin;
6f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Chunk;
7f6ef2e50SAndreas Gohr
/**
 * Defines a vector storage for page chunks and their embeddings
 *
 * Please note that chunkIDs are created outside of the storage. They reference the page's ID in
 * DokuWiki's fulltext index. ChunkIDs count from the page's id*100 upwards. E.g. page 12 will have
 * chunks 1200, 1201, 1202, ...
 */
abstract class AbstractStorage
{
    /**
     * Logger used by the storage; has no default and stays null until setLogger() is called
     *
     * @var CLIPlugin|null
     */
    protected $logger;

    /**
     * Every concrete storage has to provide a constructor accepting the configuration
     *
     * @param array $config The plugin's configuration
     */
    abstract public function __construct(array $config);

    /**
     * Set the logger this storage should use
     *
     * The value is stored as is, no validation is done here.
     *
     * @param CLIPlugin $logger
     * @return void
     */
    public function setLogger($logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the chunk with the given ID
     *
     * @param int $chunkID
     * @return Chunk|null
     */
    abstract public function getChunk($chunkID);

    /**
     * Called when the storage is about to be (re)built
     *
     * Storages may need to open a transaction or prepare other things here.
     *
     * @param bool $clear Should any existing data be thrown away?
     * @return void
     */
    abstract public function startCreation($clear = false);

    /**
     * Called when the storage is (re)built and the existing chunks should be reused
     *
     * Storages that can be updated may simply do nothing here.
     *
     * @param string $page The page the chunks belong to
     * @param int $firstChunkID The ID of the first chunk to reuse
     * @return void
     */
    abstract public function reusePageChunks($page, $firstChunkID);

    /**
     * Delete all chunks associated with the given page
     *
     * @param string $page The page the chunks belong to
     * @param int $firstChunkID The ID of the first chunk
     * @return void
     */
    abstract public function deletePageChunks($page, $firstChunkID);

    /**
     * Add the given new chunks to the storage
     *
     * @param Chunk[] $chunks
     * @return void
     */
    abstract public function addPageChunks($chunks);

    /**
     * All chunks have been added, finalize the storage
     *
     * This is where transactions may be committed and/or memory structures be written to disk.
     *
     * @return void
     */
    abstract public function finalizeCreation();

    /**
     * Run maintenance tasks on the storage
     *
     * Each storage can decide on its own what to do here. Documentation should explain
     * how often this should be run.
     *
     * @return void
     */
    abstract public function runMaintenance();

    /**
     * Get all chunks associated with the given page
     *
     * @param string $page The page the chunks belong to
     * @param int $firstChunkID The ID of the first chunk
     * @return Chunk[]
     */
    abstract public function getPageChunks($page, $firstChunkID);

    /**
     * Get the chunks most similar to the given vector, using a nearest neighbor search
     *
     * The returned chunks should be sorted by similarity, most similar first.
     *
     * If possible in an efficient way, only chunks readable by the current user should be returned (ACL check).
     * If not, the storage should return twice the $limit of chunks and the caller will filter out the readable ones.
     *
     * @param float[] $vector The vector to compare to
     * @param string $lang Limit results to this language. When empty, consider all languages
     * @param int $limit The number of results to return, see note above
     * @return Chunk[]
     */
    abstract public function getSimilarChunks($vector, $lang = '', $limit = 4);

    /**
     * Get information about the storage
     *
     * Each storage can decide on its own what to return here as key value pairs. Keys should be self-explanatory.
     *
     * @return string[]
     */
    abstract public function statistics();

    /**
     * Writes TSV files for visualizing with http://projector.tensorflow.org/
     *
     * This base implementation never writes anything: it ignores both arguments and always throws.
     *
     * @param string $vectorfile path to the file with the vectors
     * @param string $metafile path to the file with the metadata
     * @return void
     * @throws \RuntimeException always, in this base implementation
     */
    public function dumpTSV($vectorfile, $metafile)
    {
        throw new \RuntimeException('Not implemented for current storage');
    }
}
144