<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * Token and frequency indexes are addressed by an integer "group": 0 for non-split collections,
 * the token length for collections that split by length.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a pre-instantiated token index is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false,
    ) {
        // a single index object cannot stand in for the per-length index files
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Indexes given as objects are locked through the object, indexes given by name through
     * Lock::acquire(). Indexes configured as '' are skipped. If any lock fails, all locks
     * acquired so far are released again before the exception is re-thrown.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            if ($idx === '') continue;
            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only remember what was actually locked, so unlock() releases exactly that
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Also resets the writable state, so indexes created afterwards are opened non-writable.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the instance passed to the constructor if there was one, otherwise a new FileIndex
     * for the configured name, created with the collection's current writable state.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * Returns the instance passed to the constructor if there was one (the group is ignored then),
     * otherwise a new MemoryIndex for the configured name and the group's suffix.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when the group does not match the collection's split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when the group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while positive integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * List the index groups that may currently hold data
     *
     * Non-split collections always use the single group 0. Split collections use one group per
     * token length from 1 up to getTokenIndexMaximum(); the list is empty when that maximum is 0.
     *
     * @return int[]
     * @throws IndexLockException
     */
    private function getIndexGroups(): array
    {
        if (!$this->splitByLength) return [0];

        $max = $this->getTokenIndexMaximum();
        return $max > 0 ? range(1, $max) : [];
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...]; empty when the frequency index does not exist
     * @throws IndexLockException
     * @throws IndexUsageException when the group does not match the collection's split mode
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) return [];
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Collects the entity IDs referenced in any existing frequency index and looks up their
     * names in the entity index. Empty names are dropped from the result.
     *
     * @return string[] entity names
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function getEntitiesWithData(): array
    {
        // collect entity IDs from all frequency index groups
        $entityIds = [];
        foreach ($this->getIndexGroups() as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) continue;
            foreach ($freqIndex as $line) {
                // only the entity IDs (keys) matter here, the frequency is ignored
                foreach (TupleOps::parseTuples($line) as $entityId => $frequency) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        $names = $this->getEntityIndex()->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Check the structural integrity of this collection's indexes
     *
     * Verifies that paired indexes have matching line counts:
     * - token == frequency (per group, both keyed by token RID)
     * - entity == reverse (both keyed by entity RID)
     *
     * A group where neither index exists is skipped; a group where only one of the two exists
     * is reported. The entity/reverse pair is only compared when both indexes exist.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function checkIntegrity(): void
    {
        // Check token/frequency pairs
        foreach ($this->getIndexGroups() as $group) {
            $tokenIndex = $this->getTokenIndex($group);
            $freqIndex = $this->getFrequencyIndex($group);

            // query each index once instead of re-checking in every condition
            $hasTokens = $tokenIndex->exists();
            $hasFrequencies = $freqIndex->exists();

            if (!$hasTokens && !$hasFrequencies) continue;

            if ($hasTokens !== $hasFrequencies) {
                throw new IndexIntegrityException(
                    "Group $group: missing " .
                    ($hasTokens ? 'frequency' : 'token') . ' index'
                );
            }

            $tc = count($tokenIndex);
            $fc = count($freqIndex);
            if ($tc !== $fc) {
                throw new IndexIntegrityException(
                    "Group $group: token count ($tc) != frequency count ($fc)"
                );
            }
        }

        // Check entity/reverse pair
        $entityIndex = $this->getEntityIndex();
        $reverseIndex = $this->getReverseIndex();
        if ($entityIndex->exists() && $reverseIndex->exists()) {
            $ec = count($entityIndex);
            $rc = count($reverseIndex);
            if ($ec !== $rc) {
                throw new IndexIntegrityException(
                    "Entity count ($ec) != reverse count ($rc)"
                );
            }
        }
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when lock() has not been called successfully before
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0, so anything not overwritten by $new is marked for removal
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves each token index used.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens become integer array keys, the index expects strings
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An empty record yields an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Entries without a token ID (an empty record, stray ':' separators or a dangling "group*")
     * are skipped instead of producing an empty-string token ID.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            if ($tokenId === '') continue;
            // no group prefix left over means a non-split record, which lives in group 0
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (keys) are written; group 0 is written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // persist once per group after all of its rows were changed
            $freqIndex->save();
        }
    }
}