xref: /dokuwiki/inc/Search/Collection/AbstractCollection.php (revision 95b16223691931fa8086866a2b318bc319b06a8f)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexAccessException;
6use dokuwiki\Search\Exception\IndexLockException;
7use dokuwiki\Search\Exception\IndexUsageException;
8use dokuwiki\Search\Exception\IndexWriteException;
9use dokuwiki\Search\Index\AbstractIndex;
10use dokuwiki\Search\Index\FileIndex;
11use dokuwiki\Search\Index\Lock;
12use dokuwiki\Search\Index\MemoryIndex;
13use dokuwiki\Search\Index\TupleOps;
14use dokuwiki\Search\Tokenizer;
15
16/**
17 * Abstract base class for index collections
18 *
19 * A collection manages a group of related indexes that together provide a specific search use case.
20 * Every collection works with four index types: entity, token, frequency, and reverse.
21 *
22 * entity - the list of the main entities (eg. pages)
23 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
24 * frequency - how often a token appears on an entity (can be split into multiple files)
25 * reverse - the list of tokens assigned to each entity
26 *
27 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
28 * @author Andreas Gohr <andi@splitbrain.org>
29 * @author Tom N Harris <tnharris@whoopdedo.org>
30 */
31abstract class AbstractCollection
32{
33    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
34    protected array $lockedIndexes = [];
35
36    /** @var bool Has a lock been acquired for all used indexes? */
37    protected bool $isWritable = false;
38
39    /**
40     * Initialize the collection with the names of the indexes it manages
41     *
42     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
43     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
44     *
45     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
46     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
47     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
48     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
49     * @param bool $splitByLength Whether to split token/frequency indexes by token length
50     * @throws IndexUsageException
51     */
52    public function __construct(
53        protected string|AbstractIndex $idxEntity,
54        protected string|AbstractIndex $idxToken,
55        protected string $idxFrequency = '',
56        protected string $idxReverse = '',
57        protected bool   $splitByLength = false
58    )
59    {
60        if ($idxToken instanceof AbstractIndex && $splitByLength) {
61            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
62        }
63    }
64
65    /**
66     * Destructor
67     *
68     * Ensures locks are released when the class is destroyed
69     */
70    public function __destruct()
71    {
72        $this->unlock();
73    }
74
75    /**
76     * Lock all indexes for writing
77     *
78     * @return $this can be used for chaining
79     * @throws IndexLockException
80     */
81    public function lock(): static
82    {
83        foreach ([
84            $this->idxEntity,
85            $this->idxToken,
86            $this->idxFrequency,
87            $this->idxReverse
88        ] as $idx) {
89            if ($idx === '') continue;
90            try {
91                if ($idx instanceof AbstractIndex) {
92                    $idx->lock();
93                    $this->lockedIndexes[] = $idx;
94                } else {
95                    Lock::acquire($idx);
96                    $this->lockedIndexes[] = $idx;
97                }
98            } catch (IndexLockException $e) {
99                $this->unlock();
100                throw $e;
101            }
102        }
103        $this->isWritable = true;
104        return $this;
105    }
106
107    /**
108     * Unlock all indexes that were successfully locked
109     *
110     * @return void
111     */
112    public function unlock(): void
113    {
114        foreach ($this->lockedIndexes as $idx) {
115            if ($idx instanceof AbstractIndex) {
116                $idx->unlock();
117            } else {
118                Lock::release($idx);
119            }
120        }
121        $this->lockedIndexes = [];
122        $this->isWritable = false;
123    }
124
125    /**
126     * @return AbstractIndex
127     * @throws IndexLockException
128     */
129    public function getEntityIndex(): AbstractIndex
130    {
131        if ($this->idxEntity instanceof AbstractIndex) {
132            return $this->idxEntity;
133        }
134        return new FileIndex($this->idxEntity, '', $this->isWritable);
135    }
136
137    /**
138     * @param int|string $suffix
139     * @return AbstractIndex
140     * @throws IndexLockException
141     */
142    public function getTokenIndex(int|string $suffix): AbstractIndex
143    {
144        if ($this->idxToken instanceof AbstractIndex) {
145            return $this->idxToken;
146        }
147        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
148    }
149
150    /**
151     * @param int|string $suffix
152     * @return AbstractIndex
153     * @throws IndexLockException
154     */
155    public function getFrequencyIndex(int|string $suffix): AbstractIndex
156    {
157        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
158    }
159
160    /**
161     * @return AbstractIndex
162     * @throws IndexLockException
163     */
164    public function getReverseIndex(): AbstractIndex
165    {
166        return new FileIndex($this->idxReverse, '', $this->isWritable);
167    }
168
169    /**
170     * Maximum suffix for the token indexes (eg. max word length currently stored)
171     *
172     * @return int
173     * @throws IndexLockException
174     */
175    public function getTokenIndexMaximum(): int
176    {
177        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
178    }
179
180    /**
181     * Add or update the tokens for a given entity
182     *
183     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
184     * entity from the index.
185     *
186     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
187     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
188     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
189     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
190     *
191     * @param string $entity The name of the entity
192     * @param string[] $tokens The list of tokens for this entity
193     * @throws IndexAccessException
194     * @throws IndexWriteException
195     * @throws IndexLockException
196     */
197    public function addEntity(string $entity, array $tokens): void
198    {
199        if (!$this->isWritable) {
200            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
201        }
202
203        $entityIndex = $this->getEntityIndex();
204        $entityId = $entityIndex->accessCachedValue($entity);
205
206        $old = $this->getReverseAssignments($entity);
207        $new = $this->resolveTokens($tokens);
208
209        $merged = array_replace_recursive($old, $new);
210
211        $this->updateIndexes($merged, $entityId);
212        $this->saveReverseAssignments($entity, $merged);
213    }
214
215    /**
216     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
217     *
218     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
219     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
220     * token strings to IDs via the appropriate token index.
221     *
222     * @param string[] $tokens The raw token list
223     * @return array [group => [tokenId => frequency, ...], ...]
224     * @throws IndexLockException
225     * @throws IndexWriteException
226     */
227    protected function resolveTokens(array $tokens): array
228    {
229        $counted = $this->countTokens($tokens);
230
231        // group tokens by their index suffix
232        $groups = [];
233        foreach ($counted as $token => $freq) {
234            $group = $this->splitByLength ? (string)Tokenizer::tokenLength($token) : '';
235            $groups[$group][$token] = $freq;
236        }
237
238        // resolve token strings to IDs
239        $result = [];
240        foreach ($groups as $group => $tokenFreqs) {
241            $tokenIndex = $this->getTokenIndex($group);
242            $result[$group] = [];
243            foreach ($tokenFreqs as $token => $freq) {
244                $tokenId = $tokenIndex->getRowID((string)$token);
245                $result[$group][$tokenId] = $freq;
246            }
247            $tokenIndex->save();
248        }
249
250        return $result;
251    }
252
253    /**
254     * Count or deduplicate tokens and return their frequencies
255     *
256     * FrequencyCollections return actual occurrence counts.
257     * LookupCollections deduplicate and return 1 for each token.
258     *
259     * @param string[] $tokens The raw token list
260     * @return array [token => frequency, ...]
261     */
262    abstract protected function countTokens(array $tokens): array;
263
264    /**
265     * Get the token assignments for a given entity from the reverse index
266     *
267     * Returns the parsed reverse index record. The exact structure depends on the collection type.
268     *
269     * @param string $entity
270     * @return array
271     * @throws IndexAccessException
272     * @throws IndexWriteException
273     * @throws IndexLockException
274     */
275    public function getReverseAssignments(string $entity): array
276    {
277        $entityIndex = $this->getEntityIndex();
278        $entityId = $entityIndex->accessCachedValue($entity);
279
280        $reverseIndex = $this->getReverseIndex();
281        $record = $reverseIndex->retrieveRow($entityId);
282
283        if ($record === '') {
284            return [];
285        }
286
287        return $this->parseReverseRecord($record);
288    }
289
290    /**
291     * Store the reverse index info about what tokens are assigned to the entity
292     *
293     * @param string $entity
294     * @param array $data The assignment data to store
295     * @return void
296     * @throws IndexAccessException
297     * @throws IndexWriteException
298     * @throws IndexLockException
299     */
300    protected function saveReverseAssignments(string $entity, array $data): void
301    {
302        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
303        $data = array_map('array_filter', $data);
304        $data = array_filter($data);
305
306        $record = $this->formatReverseRecord($data);
307
308        $entityIndex = $this->getEntityIndex();
309        $entityId = $entityIndex->accessCachedValue($entity);
310
311        $reverseIndex = $this->getReverseIndex();
312        $reverseIndex->changeRow($entityId, $record);
313    }
314
315    /**
316     * Parse a reverse index record into a two-level array
317     *
318     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
319     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
320     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
321     *
322     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
323     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
324     * This mirrors how TupleOps omits *1 for frequency 1.
325     *
326     * @param string $record The raw reverse index record
327     * @return array [group => [tokenId => 0, ...], ...]
328     */
329    protected function parseReverseRecord(string $record): array
330    {
331        $result = [];
332        foreach (explode(':', $record) as $entry) {
333            $parts = explode('*', $entry, 2);
334            $tokenId = array_pop($parts);
335            $group = array_pop($parts) ?? '';
336            $result[$group][$tokenId] = 0;
337        }
338        return $result;
339    }
340
341    /**
342     * Format a two-level array into a reverse index record string
343     *
344     * @param array $data [group => [tokenId => freq, ...], ...]
345     * @return string The formatted record
346     */
347    protected function formatReverseRecord(array $data): string
348    {
349        $parts = [];
350        foreach ($data as $group => $tokens) {
351            $prefix = $group === '' ? '' : "$group*";
352            foreach (array_keys($tokens) as $tokenId) {
353                $parts[] = $prefix . $tokenId;
354            }
355        }
356        return implode(':', $parts);
357    }
358
359    /**
360     * Update frequency indexes with the given data
361     *
362     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
363     * corresponding frequency index for each group. A frequency of 0 removes the entity
364     * from that token's frequency record.
365     *
366     * @param array $data [group => [tokenId => frequency, ...], ...]
367     * @param int $entityId The entity ID
368     * @throws IndexLockException
369     * @throws IndexWriteException
370     */
371    protected function updateIndexes(array $data, int $entityId): void
372    {
373        foreach ($data as $group => $tokens) {
374            $freqIndex = $this->getFrequencyIndex($group);
375            foreach ($tokens as $tokenId => $freq) {
376                $record = $freqIndex->retrieveRow($tokenId);
377                $record = TupleOps::updateTuple($record, $entityId, $freq);
378                $freqIndex->changeRow($tokenId, $record);
379            }
380            $freqIndex->save();
381        }
382    }
383}
384