xref: /dokuwiki/inc/Search/Collection/AbstractCollection.php (revision 21fbd01b3c3eea88b767376b7b158f31f0f63127)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexAccessException;
6use dokuwiki\Search\Exception\IndexIntegrityException;
7use dokuwiki\Search\Exception\IndexLockException;
8use dokuwiki\Search\Exception\IndexUsageException;
9use dokuwiki\Search\Exception\IndexWriteException;
10use dokuwiki\Search\Index\AbstractIndex;
11use dokuwiki\Search\Index\FileIndex;
12use dokuwiki\Search\Index\Lock;
13use dokuwiki\Search\Index\MemoryIndex;
14use dokuwiki\Search\Index\TupleOps;
15use dokuwiki\Search\Tokenizer;
16
/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity - the list of the main entities (eg. pages)
 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse - the list of tokens assigned to each entity
 *
 * Token and frequency indexes are addressed by a "group": group 0 for collections that are not
 * split, and the token length (a positive integer) for collections that are split by length.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a token index instance is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false,
    ) {
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Indexes configured with an empty name are skipped. If any lock cannot be acquired, all locks
     * acquired so far are released again before the exception is passed on.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            if ($idx === '') {
                continue;
            }

            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
            } catch (IndexLockException $e) {
                // roll back the partial lock set so nothing stays locked behind our back
                $this->unlock();
                throw $e;
            }

            // only reached when the lock was acquired, so unlock() releases exactly what we hold
            $this->lockedIndexes[] = $idx;
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Afterwards the collection is no longer writable.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * A pre-instantiated index passed to the constructor is returned as is, otherwise a new
     * FileIndex is created for the configured name.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * A pre-instantiated index passed to the constructor is returned as is; $group is not
     * consulted in that case. Otherwise a new MemoryIndex is created for the configured name.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while positive integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * List the index groups this collection currently uses
     *
     * Non-split collections only use group 0. Split collections use the groups 1 up to the
     * current token index maximum; the list is empty when that maximum is 0.
     *
     * @return int[]
     * @throws IndexLockException
     */
    private function getIndexGroups(): array
    {
        if (!$this->splitByLength) {
            return [0];
        }

        $max = $this->getTokenIndexMaximum();
        return $max > 0 ? range(1, $max) : [];
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...]; empty when the frequency index does not exist
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) {
            return [];
        }
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Collects the entity IDs referenced in every existing frequency index group and
     * looks up their names in the entity index. Empty names are dropped from the result.
     *
     * @return string[] entity names
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function getEntitiesWithData(): array
    {
        // collect entity IDs from all frequency index groups, using the keys for deduplication
        $entityIds = [];
        foreach ($this->getIndexGroups() as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) {
                continue;
            }
            foreach ($freqIndex as $line) {
                foreach (array_keys(TupleOps::parseTuples($line)) as $entityId) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        $names = $this->getEntityIndex()->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Check the structural integrity of this collection's indexes
     *
     * Verifies that paired indexes have matching line counts:
     * - token == frequency (per group, both keyed by token RID)
     * - entity == reverse (both keyed by entity RID)
     *
     * A group where neither the token nor the frequency index exists is skipped; a group where
     * only one of them exists is reported. The entity/reverse pair is only compared when both exist.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function checkIntegrity(): void
    {
        // Check token/frequency pairs
        foreach ($this->getIndexGroups() as $group) {
            $tokenIndex = $this->getTokenIndex($group);
            $freqIndex = $this->getFrequencyIndex($group);

            // query each index once instead of re-checking in every condition below
            $hasTokens = $tokenIndex->exists();
            $hasFrequencies = $freqIndex->exists();

            if (!$hasTokens && !$hasFrequencies) {
                continue;
            }

            if ($hasTokens !== $hasFrequencies) {
                throw new IndexIntegrityException(
                    "Group $group: missing " .
                    ($hasTokens ? 'frequency' : 'token') . ' index'
                );
            }

            $tc = count($tokenIndex);
            $fc = count($freqIndex);
            if ($tc !== $fc) {
                throw new IndexIntegrityException(
                    "Group $group: token count ($tc) != frequency count ($fc)"
                );
            }
        }

        // Check entity/reverse pair
        $entityIndex = $this->getEntityIndex();
        $reverseIndex = $this->getReverseIndex();
        if ($entityIndex->exists() && $reverseIndex->exists()) {
            $ec = count($entityIndex);
            $rc = count($reverseIndex);
            if ($ec !== $rc) {
                throw new IndexIntegrityException(
                    "Entity count ($ec) != reverse count ($rc)"
                );
            }
        }
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when lock() has not been called before
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0, so anything not overwritten by $new is marked for removal
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves each token index used.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens become integer array keys, so cast back to string for the lookup
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An empty record results in an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => frequency, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            $parts = explode('*', $entry, 2);
            // the token ID is always the last part; a group prefix is only present for split collections
            $tokenId = array_pop($parts);
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (the inner array keys) are written; the frequencies are not stored.
     * Entries of group 0 are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record. Each frequency index is saved once after all of
     * its rows have been changed.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            $freqIndex->save();
        }
    }
}
521