xref: /dokuwiki/inc/Search/Collection/AbstractCollection.php (revision c66b5ec65fd5aa2f1037d2be542b49297f3aac0e)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexAccessException;
6use dokuwiki\Search\Exception\IndexLockException;
7use dokuwiki\Search\Exception\IndexWriteException;
8use dokuwiki\Search\Index\FileIndex;
9use dokuwiki\Search\Index\Lock;
10use dokuwiki\Search\Index\MemoryIndex;
11use dokuwiki\Search\Index\TupleOps;
12use dokuwiki\Search\Tokenizer;
13
14/**
15 * Abstract base class for index collections
16 *
17 * A collection manages a group of related indexes that together provide a specific search use case.
18 * Every collection works with four index types: entity, token, frequency, and reverse.
19 *
20 * entity - the list of the main entities (eg. pages)
21 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
22 * frequency - how often a token appears on an entity (can be split into multiple files)
23 * reverse - the list of tokens assigned to each entity
24 *
25 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
26 * @author Andreas Gohr <andi@splitbrain.org>
27 * @author Tom N Harris <tnharris@whoopdedo.org>
28 */
abstract class AbstractCollection
{
    /** @var string[] Index names that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * @param string $idxEntity Name of the primary entity index, eg. 'page'
     * @param string $idxToken Base name of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     */
    public function __construct(
        protected string $idxEntity,
        protected string $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index names that are empty (not configured) are skipped. If any lock cannot be acquired, all locks
     * acquired so far are released again before the exception is passed on.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        // array_filter() without a callback drops the unconfigured (empty) index names
        $indexNames = array_filter([
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse
        ]);

        foreach ($indexNames as $idxName) {
            try {
                Lock::acquire($idxName);
                $this->lockedIndexes[] = $idxName;
            } catch (IndexLockException $e) {
                // roll back the partially acquired locks, never leave a half-locked collection behind
                $this->unlock();
                throw $e;
            }
        }
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Resets the collection to the non-writable state. Safe to call when nothing is locked.
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->lockedIndexes as $idxName) {
            Lock::release($idxName);
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
    }

    /**
     * Get the entity index, opened writable only if the collection is currently locked
     *
     * @return FileIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): FileIndex
    {
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix, opened writable only if the collection is currently locked
     *
     * @param int|string $suffix
     * @return MemoryIndex
     * @throws IndexLockException
     */
    public function getTokenIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix, opened writable only if the collection is currently locked
     *
     * @param int|string $suffix
     * @return MemoryIndex
     * @throws IndexLockException
     */
    public function getFrequencyIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index, opened writable only if the collection is currently locked
     *
     * @return FileIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): FileIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException if lock() has not been called successfully before
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // keys are preserved on both levels: old-only tokens stay 0, new tokens carry their frequency
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // numeric tokens arrive as integer array keys, so always hand a string to the tokenizer
            $token = (string)$token;
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength($token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An entity without a stored record yields an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => frequency, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (an empty record, a stray or trailing ':') and entries without a token ID are skipped.
     * Otherwise they would surface as an empty-string token ID that updateIndexes() would later try to use
     * as a row ID.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') {
                continue;
            }

            // "group*tokenId" yields two parts, a bare "tokenId" only one; the token ID is always last
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            $group = array_pop($parts) ?? '';

            if ($tokenId === '') {
                continue; // malformed entry like "3*" carries no token ID
            }

            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the keys (token IDs) of each group are written, the frequencies are not stored. Entries of the
     * '' group are written without a "group*" prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // persist once per group, after all of its rows have been changed
            $freqIndex->save();
        }
    }
}
359