<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity - the list of the main entities (eg. pages)
 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse - the list of tokens assigned to each entity
 *
 * Token and frequency indexes are addressed by an integer "group": group 0 is the single
 * non-split index, positive groups are the token lengths of a split-by-length collection
 * (see groupToSuffix()).
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a pre-instantiated token index is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
        // a single index object can not stand in for one index per token length
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index objects are locked through their own lock() method, named indexes through
     * Lock::acquire(). Empty names (unused frequency/reverse index) are skipped. If any lock
     * can not be obtained, all locks acquired so far are released again before the exception
     * is passed on.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            if ($idx === '') continue;
            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only remember what was actually locked, so unlock() releases exactly that
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Also resets the collection to the non-writable state.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the shared instance when one was passed to the constructor, otherwise a new
     * FileIndex for the configured name.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * Returns the shared instance when one was passed to the constructor (the group is not
     * looked at in that case), otherwise a new MemoryIndex for the configured name and group.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while positive integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * An empty array is returned when the frequency index of that group does not exist.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) return [];
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Scans every existing frequency index group, collects the entity IDs referenced there
     * and resolves them to names via the entity index. Empty names are dropped.
     *
     * @return string[] entity names
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function getEntitiesWithData(): array
    {
        // Determine the frequency index groups to scan. For split collections these are the
        // token lengths 1..max. The maximum may be below 1 when nothing has been stored yet
        // (NOTE(review): presumably 0 - confirm against AbstractIndex::max()). range(1, 0)
        // would then count DOWN and yield [1, 0], and group 0 is rejected by groupToSuffix()
        // for split collections - so an empty collection must result in no groups at all.
        if ($this->splitByLength) {
            $maxLength = $this->getTokenIndexMaximum();
            $groups = $maxLength >= 1 ? range(1, $maxLength) : [];
        } else {
            $groups = [0];
        }

        // collect entity IDs from all frequency index groups
        $entityIds = [];
        foreach ($groups as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) continue;
            foreach ($freqIndex as $line) {
                // only the entity IDs matter here, the frequencies are ignored
                foreach (TupleOps::parseTuples($line) as $entityId => $frequency) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        // no entity is referenced anywhere, so there is nothing to look up
        if ($entityIds === []) return [];

        $names = $this->getEntityIndex()->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when the collection has not been locked via lock()
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0, new entries their real value; see the method description
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves each token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     * @throws IndexUsageException when a group does not match the collection's split mode
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens become integer array keys, so cast back to string
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An empty array is returned when the reverse index holds no record for the entity.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => frequency, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            $parts = explode('*', $entry, 2);
            // the token ID is always the last part; a group prefix is only there for split collections
            $tokenId = array_pop($parts);
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * This is the inverse of parseReverseRecord(): entries of group 0 are written without a
     * group prefix, all others as "group*tokenId". The frequencies are not stored.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     * @throws IndexUsageException when a group does not match the collection's split mode
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // write each group's index once, after all its rows have been changed
            $freqIndex->save();
        }
    }
}