<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a token index instance is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index instances are locked through their own lock() method, named indexes through
     * Lock::acquire(). Indexes configured as an empty string are skipped. When any lock
     * cannot be obtained, all locks acquired so far are released before the exception
     * is passed on.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            if ($idx === '') continue;
            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only remember what was actually locked, so unlock() releases exactly that
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                // roll back the partial lock set
                $this->unlock();
                throw $e;
            }
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Afterwards the collection is no longer writable.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the shared instance when one was passed to the constructor, otherwise a new
     * FileIndex that is writable if the collection currently holds its locks.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * A token index instance passed to the constructor is returned as is, regardless of $group.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while non-zero integers
     * represent split-by-length indexes (suffix = the group number).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * List the index groups that may currently hold data
     *
     * Non-split collections always use the single group 0. Split collections use one group
     * per token length from 1 up to the current token index maximum; when that maximum is 0
     * there are no groups at all.
     *
     * @return int[]
     * @throws IndexLockException
     */
    private function listIndexGroups(): array
    {
        if (!$this->splitByLength) {
            return [0];
        }
        $max = $this->getTokenIndexMaximum();
        return $max > 0 ? range(1, $max) : [];
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...]; empty when the frequency index does not exist
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) return [];
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Scans every existing frequency index group, collects the entity IDs referenced there
     * and resolves them to names through the entity index. Empty names are dropped.
     *
     * @return string[] entity names
     * @throws IndexLockException
     */
    public function getEntitiesWithData(): array
    {
        $entityIndex = $this->getEntityIndex();

        // collect entity IDs from all frequency index groups (used as keys to deduplicate)
        $entityIds = [];
        foreach ($this->listIndexGroups() as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) continue;
            foreach ($freqIndex as $line) {
                foreach (array_keys(TupleOps::parseTuples($line)) as $entityId) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        $names = $entityIndex->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Check the structural integrity of this collection's indexes
     *
     * Verifies that paired indexes have matching line counts:
     * - token == frequency (per group, both keyed by token RID)
     * - entity == reverse (both keyed by entity RID)
     *
     * A group where neither token nor frequency index exists is skipped; a group where only
     * one of the two exists is reported. The entity/reverse pair is only compared when both exist.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     * @throws IndexLockException
     * @throws IndexUsageException
     */
    public function checkIntegrity(): void
    {
        // Check token/frequency pairs
        foreach ($this->listIndexGroups() as $group) {
            $tokenIndex = $this->getTokenIndex($group);
            $freqIndex = $this->getFrequencyIndex($group);

            $hasTokens = $tokenIndex->exists();
            $hasFrequencies = $freqIndex->exists();

            if (!$hasTokens && !$hasFrequencies) continue;

            if ($hasTokens !== $hasFrequencies) {
                throw new IndexIntegrityException(
                    "Group $group: missing " .
                    ($hasTokens ? 'frequency' : 'token') . ' index'
                );
            }

            $tc = count($tokenIndex);
            $fc = count($freqIndex);
            if ($tc !== $fc) {
                throw new IndexIntegrityException(
                    "Group $group: token count ($tc) != frequency count ($fc)"
                );
            }
        }

        // Check entity/reverse pair
        $entityIndex = $this->getEntityIndex();
        $reverseIndex = $this->getReverseIndex();
        if ($entityIndex->exists() && $reverseIndex->exists()) {
            $ec = count($entityIndex);
            $rc = count($reverseIndex);
            if ($ec !== $rc) {
                throw new IndexIntegrityException(
                    "Entity count ($ec) != reverse count ($rc)"
                );
            }
        }
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when the collection has not been locked for writing
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0, so anything not overwritten by $new is flagged for removal
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves each touched token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens become integer array keys in PHP, hence the cast back to string
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record, or an empty array when the record is empty.
     * The exact structure depends on the collection type.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => frequency, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (as produced by an empty record or stray ':' separators) are ignored, so they
     * never show up as a bogus '' token ID.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') continue;

            // "group*tokenId" or just "tokenId"; the token ID is always the last part
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are stored; group 0 entries are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // persist once per group after all of its rows were changed
            $freqIndex->save();
        }
    }
}