xref: /dokuwiki/inc/Search/Collection/AbstractCollection.php (revision d92c078c66fe0feb3beeeb1a383e15ca031fb4f8)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexAccessException;
6use dokuwiki\Search\Exception\IndexLockException;
7use dokuwiki\Search\Exception\IndexWriteException;
8use dokuwiki\Search\Index\FileIndex;
9use dokuwiki\Search\Index\TupleOps;
10use dokuwiki\Search\Tokenizer;
11use dokuwiki\Search\Index\Lock;
12use dokuwiki\Search\Index\MemoryIndex;
13
/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity - the list of the main entities (eg. pages)
 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * @param string $idxEntity Name of the primary entity index, eg. 'page'
     * @param string $idxToken Base name of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     */
    public function __construct(
        protected string $idxEntity,
        protected string $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool   $splitByLength = false
    ) {
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        if ($this->isWritable) {
            $this->unlock();
        }
    }

    /**
     * Names of all indexes this collection has to lock
     *
     * Index names that were left empty in the constructor are skipped.
     *
     * @return string[]
     */
    private function indexNames(): array
    {
        return array_filter([$this->idxEntity, $this->idxToken, $this->idxFrequency, $this->idxReverse]);
    }

    /**
     * Lock all indexes for writing
     *
     * When one of the locks can not be acquired, the locks obtained so far by this call are released
     * again and an exception is thrown. Locks that were never acquired by this call are left alone, so
     * a lock held by somebody else is not released by accident.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $acquired = [];
        foreach ($this->indexNames() as $idxName) {
            if (!(new Lock($idxName))->acquire()) {
                // roll back only what this call obtained
                foreach ($acquired as $heldName) {
                    (new Lock($heldName))->release();
                }
                throw new IndexLockException('Could not lock ' . $idxName . ' for writing');
            }
            $acquired[] = $idxName;
        }
        // locking succeeded
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes
     *
     * Releases the lock of every index used by this collection and marks the collection as read-only.
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->indexNames() as $idxName) {
            (new Lock($idxName))->release();
        }
        $this->isWritable = false;
    }

    /**
     * Access the entity index
     *
     * The index is opened writable only while the collection holds its locks.
     *
     * @return FileIndex
     */
    public function getEntityIndex(): FileIndex
    {
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Access the token index for the given suffix
     *
     * @param int|string $suffix
     * @return MemoryIndex
     */
    public function getTokenIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Access the frequency index for the given suffix
     *
     * @param int|string $suffix
     * @return MemoryIndex
     */
    public function getFrequencyIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Access the reverse index
     *
     * The index is opened writable only while the collection holds its locks.
     *
     * @return FileIndex
     */
    public function getReverseIndex(): FileIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when lock() has not been called before
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0, new entries carry their frequency; new wins on identical keys
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength($token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // array keys may have been cast to int by PHP, the index expects a string
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An entity without a stored record yields an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map(static fn(array $tokens): array => array_filter($tokens), $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty segments (eg. caused by a trailing or doubled ':') carry no token and are ignored.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') {
                continue; // nothing to assign, would otherwise create a bogus empty token ID
            }
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            $group = array_pop($parts) ?? ''; // no '*' in the entry means no group prefix
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are written, the frequencies are not part of the record. Tokens of the
     * '' group are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // one write per group once all of its rows have been changed
            $freqIndex->save();
        }
    }
}
343