xref: /dokuwiki/inc/Search/Collection/AbstractCollection.php (revision 21fbd01b3c3eea88b767376b7b158f31f0f63127)
1f2bbffb5SAndreas Gohr<?php
2f2bbffb5SAndreas Gohr
3f2bbffb5SAndreas Gohrnamespace dokuwiki\Search\Collection;
4f2bbffb5SAndreas Gohr
5f2bbffb5SAndreas Gohruse dokuwiki\Search\Exception\IndexAccessException;
6*21fbd01bSAndreas Gohruse dokuwiki\Search\Exception\IndexIntegrityException;
7f2bbffb5SAndreas Gohruse dokuwiki\Search\Exception\IndexLockException;
895b16223SAndreas Gohruse dokuwiki\Search\Exception\IndexUsageException;
9f2bbffb5SAndreas Gohruse dokuwiki\Search\Exception\IndexWriteException;
1095b16223SAndreas Gohruse dokuwiki\Search\Index\AbstractIndex;
11f2bbffb5SAndreas Gohruse dokuwiki\Search\Index\FileIndex;
12f2bbffb5SAndreas Gohruse dokuwiki\Search\Index\Lock;
13f2bbffb5SAndreas Gohruse dokuwiki\Search\Index\MemoryIndex;
140a9fafedSAndreas Gohruse dokuwiki\Search\Index\TupleOps;
150a9fafedSAndreas Gohruse dokuwiki\Search\Tokenizer;
16f2bbffb5SAndreas Gohr
/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity - the list of the main entities (eg. pages)
 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse - the list of tokens assigned to each entity
 *
 * Token and frequency indexes are addressed by a "group": group 0 is the single, non-split index,
 * positive groups are the per-token-length indexes of a split-by-length collection.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a pre-instantiated token index is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
        // a single index object cannot stand in for the per-length family of token indexes
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index objects are locked through their own lock() method, index names through Lock::acquire().
     * Indexes configured with an empty name are skipped. If any lock fails, every lock acquired so far
     * is released again before the exception is re-thrown, so the collection is never left half locked.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            // an empty name means this index type is not used by the collection
            if ($idx === '') {
                continue;
            }

            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only remember what was actually locked, so unlock() releases exactly that
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Resets the list of locked indexes and marks the collection as read-only again.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }

        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the shared instance when one was passed to the constructor, otherwise a new FileIndex
     * for the configured name, created with the collection's current writable state.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * When a pre-instantiated token index was passed to the constructor it is returned as is and
     * $group is neither used nor validated. Otherwise a new MemoryIndex is created for the group.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when a named index is used and $group does not match the split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when $group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while positive integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * List the index groups this collection currently consists of
     *
     * Non-split collections always have the single group 0. Split collections have the groups
     * 1..max where max is the current token index maximum; no groups when that maximum is 0.
     *
     * Private on purpose: it is an implementation detail shared by getEntitiesWithData() and
     * checkIntegrity() and must not clash with methods declared in subclasses.
     *
     * @return int[]
     * @throws IndexLockException
     */
    private function listIndexGroups(): array
    {
        if (!$this->splitByLength) {
            return [0];
        }
        $max = $this->getTokenIndexMaximum();
        return $max > 0 ? range(1, $max) : [];
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...], empty when the frequency index does not exist
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) {
            return [];
        }
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Scans every existing frequency index group, collects the entity IDs referenced there and
     * resolves them to names through the entity index. Empty names are dropped.
     *
     * @return string[] entity names
     */
    public function getEntitiesWithData(): array
    {
        $entityIndex = $this->getEntityIndex();

        // collect entity IDs from all frequency index groups; used as a set via the array keys
        $entityIds = [];
        foreach ($this->listIndexGroups() as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) {
                continue;
            }
            foreach ($freqIndex as $line) {
                foreach (array_keys(TupleOps::parseTuples($line)) as $entityId) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        $names = $entityIndex->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * Delegates to max() of the shared token index instance when one was passed to the constructor,
     * otherwise to max() of a MemoryIndex created for the configured token index name.
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Check the structural integrity of this collection's indexes
     *
     * Verifies that paired indexes have matching line counts:
     * - token == frequency (per group, both keyed by token RID); a group where only one of the
     *   two indexes exists is reported as an error, a group where neither exists is skipped
     * - entity == reverse (both keyed by entity RID); only compared when both indexes exist
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        // Check token/frequency pairs
        foreach ($this->listIndexGroups() as $group) {
            $tokenIndex = $this->getTokenIndex($group);
            $freqIndex = $this->getFrequencyIndex($group);

            $hasTokens = $tokenIndex->exists();
            $hasFrequencies = $freqIndex->exists();

            if (!$hasTokens && !$hasFrequencies) {
                continue;
            }

            if ($hasTokens !== $hasFrequencies) {
                throw new IndexIntegrityException(
                    "Group $group: missing " .
                    ($hasTokens ? 'frequency' : 'token') . ' index'
                );
            }

            $tc = count($tokenIndex);
            $fc = count($freqIndex);
            if ($tc !== $fc) {
                throw new IndexIntegrityException(
                    "Group $group: token count ($tc) != frequency count ($fc)"
                );
            }
        }

        // Check entity/reverse pair
        $entityIndex = $this->getEntityIndex();
        $reverseIndex = $this->getReverseIndex();
        if ($entityIndex->exists() && $reverseIndex->exists()) {
            $ec = count($entityIndex);
            $rc = count($reverseIndex);
            if ($ec !== $rc) {
                throw new IndexIntegrityException(
                    "Entity count ($ec) != reverse count ($rc)"
                );
            }
        }
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when lock() has not been called successfully before
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0 and therefore act as deletion markers unless overwritten by new values
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves each touched token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens become integer array keys, the index lookup needs the string again
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An empty record yields an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (an empty record, or stray/trailing ':' separators) are ignored so they can never
     * surface as a bogus '' token ID further down the update path.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') {
                continue;
            }
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            // no prefix left after taking the token ID means the non-split group 0
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (array keys) are written; frequencies are not part of the record.
     * Group 0 entries are written without the "group*" prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record. Each group's frequency index is saved once after
     * all of its tokens have been processed.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            $freqIndex->save();
        }
    }
}
521