xref: /dokuwiki/inc/Search/Collection/AbstractCollection.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexAccessException;
6use dokuwiki\Search\Exception\IndexLockException;
7use dokuwiki\Search\Exception\IndexUsageException;
8use dokuwiki\Search\Exception\IndexWriteException;
9use dokuwiki\Search\Index\AbstractIndex;
10use dokuwiki\Search\Index\FileIndex;
11use dokuwiki\Search\Index\Lock;
12use dokuwiki\Search\Index\MemoryIndex;
13use dokuwiki\Search\Index\TupleOps;
14use dokuwiki\Search\Tokenizer;
15
/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity - the list of the main entities (eg. pages)
 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a pre-instantiated token index is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false,
    ) {
        // a single index object can not represent the per-length files of a split index
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Indexes configured as empty string are skipped. If any lock can not be acquired, all locks
     * acquired so far are released again before the exception is passed on.
     *
     * Calling this method on an already locked collection is a no-op.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        // already holding all locks: acquiring again would only duplicate the entries in $lockedIndexes
        if ($this->isWritable) {
            return $this;
        }

        $indexes = [$this->idxEntity, $this->idxToken, $this->idxFrequency, $this->idxReverse];
        foreach ($indexes as $idx) {
            if ($idx === '') continue; // unused index slot

            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // remember it only after the lock succeeded, so unlock() never releases a foreign lock
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Resets the collection to read-only mode. Safe to call when nothing is locked.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }

        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the shared instance when one was passed to the constructor, otherwise a new
     * FileIndex that is writable when the collection is locked.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * Returns the shared instance when one was passed to the constructor (the group is ignored then),
     * otherwise a new MemoryIndex that is writable when the collection is locked.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while positive integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...] (empty when the index does not exist)
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) return [];
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Collects the entity IDs referenced by any frequency index of this collection and resolves
     * them to names through the entity index. Entities resolving to an empty name are dropped.
     *
     * @return string[] entity names
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function getEntitiesWithData(): array
    {
        // determine the frequency index groups to scan
        if ($this->splitByLength) {
            // range(1, 0) counts downwards and would yield [1, 0]; group 0 is invalid for split
            // collections, so a collection without any token index must result in no groups at all
            $max = $this->getTokenIndexMaximum();
            $groups = $max >= 1 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        // collect entity IDs from all frequency index groups, using keys for deduplication
        $entityIds = [];
        foreach ($groups as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) continue;
            foreach ($freqIndex as $line) {
                foreach (array_keys(TupleOps::parseTuples($line)) as $entityId) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        // nothing referenced, nothing to resolve - no need to open the entity index
        if ($entityIds === []) {
            return [];
        }

        $names = $this->getEntityIndex()->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when the collection has not been locked
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityId = $this->getEntityIndex()->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry frequency 0 and thus act as delete markers unless overwritten by new data
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves that index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            // numeric tokens have been turned into integer array keys by PHP, cast them back
            $group = $this->splitByLength ? Tokenizer::tokenLength((string)$token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An entity without a stored record results in an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityId = $this->getEntityIndex()->accessCachedValue($entity);
        $record = $this->getReverseIndex()->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => frequency, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityId = $this->getEntityIndex()->accessCachedValue($entity);
        $this->getReverseIndex()->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (an empty record or stray separators) are ignored.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            // an empty entry would otherwise be recorded as a token with the ID ''
            if ($entry === '') continue;

            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            // without a group prefix nothing is left to pop and the entry belongs to group 0
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are stored, the frequencies are not part of the record. Tokens of
     * group 0 are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     * @throws IndexUsageException when a group does not match the collection's split mode
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // write once per group after all of its tokens have been updated
            $freqIndex->save();
        }
    }
}
470