<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index objects are locked through their own lock() method, index names through Lock::acquire().
     * Indexes configured as an empty string are skipped. If any lock fails, all locks acquired so far
     * are released again before the exception is rethrown.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        foreach (
            [
                $this->idxEntity,
                $this->idxToken,
                $this->idxFrequency,
                $this->idxReverse
            ] as $idx
        ) {
            if ($idx === '') continue;
            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // remember only what was actually locked, so unlock() releases exactly that
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Also resets the writable flag. Calling this without any held locks is a no-op.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the instance passed to the constructor if there was one, otherwise a new FileIndex
     * created with the collection's current writable flag.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * When a token index instance was passed to the constructor, that instance is returned
     * and $group is not evaluated.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while positive integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * List the index groups this collection currently uses
     *
     * Non-split collections only have group 0. Split collections use the groups 1 up to
     * getTokenIndexMaximum(); when that maximum is not positive there are no groups at all.
     *
     * @return int[]
     * @throws IndexLockException
     */
    private function getIndexGroups(): array
    {
        if (!$this->splitByLength) {
            return [0];
        }
        $max = $this->getTokenIndexMaximum();
        return $max > 0 ? range(1, $max) : [];
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...]
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) return [];
        return array_map(TupleOps::parseTuples(...), $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Collects the entity IDs referenced by any existing frequency index of this collection
     * and looks up their names in the entity index. Empty names are dropped from the result.
     *
     * @return string[] entity names
     * @throws IndexLockException
     */
    public function getEntitiesWithData(): array
    {
        $entityIndex = $this->getEntityIndex();

        // collect entity IDs from all frequency index groups
        $entityIds = [];
        foreach ($this->getIndexGroups() as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) continue;
            foreach ($freqIndex as $line) {
                foreach (array_keys(TupleOps::parseTuples($line)) as $entityId) {
                    // used as a set: the key is the entity ID, the value is irrelevant
                    $entityIds[$entityId] = true;
                }
            }
        }

        $names = $entityIndex->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Check the structural integrity of this collection's indexes
     *
     * Verifies that paired indexes have matching line counts:
     * - token == frequency (per group, both keyed by token RID)
     * - entity == reverse (both keyed by entity RID)
     *
     * A token/frequency pair where neither index exists is skipped; a pair where only one exists
     * is reported as an error. The entity/reverse pair is only compared when both indexes exist.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     * @throws IndexLockException
     */
    public function checkIntegrity(): void
    {
        // Check token/frequency pairs
        foreach ($this->getIndexGroups() as $group) {
            $tokenIndex = $this->getTokenIndex($group);
            $freqIndex = $this->getFrequencyIndex($group);

            // look up the existence of each index only once per group
            $hasTokens = $tokenIndex->exists();
            $hasFrequencies = $freqIndex->exists();

            if (!$hasTokens && !$hasFrequencies) continue;

            if ($hasTokens !== $hasFrequencies) {
                throw new IndexIntegrityException(
                    "Group $group: missing " .
                    ($hasTokens ? 'frequency' : 'token') . ' index'
                );
            }

            $tc = count($tokenIndex);
            $fc = count($freqIndex);
            if ($tc !== $fc) {
                throw new IndexIntegrityException(
                    "Group $group: token count ($tc) != frequency count ($fc)"
                );
            }
        }

        // Check entity/reverse pair
        $entityIndex = $this->getEntityIndex();
        $reverseIndex = $this->getReverseIndex();
        if ($entityIndex->exists() && $reverseIndex->exists()) {
            $ec = count($entityIndex);
            $rc = count($reverseIndex);
            if ($ec !== $rc) {
                throw new IndexIntegrityException(
                    "Entity count ($ec) != reverse count ($rc)"
                );
            }
        }
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // array_replace_recursive matches by key, so the integer group and token IDs are preserved
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index and saves each token index used.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // PHP turns numeric string array keys into integers, so restore the string form
            $token = (string)$token;
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An empty record results in an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map(array_filter(...), $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Entries without a token ID (eg. caused by a stray or trailing ':') are skipped, so that no
     * empty-string token ID ends up in the result.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            $parts = explode('*', $entry, 2);
            // the token ID is always the last part, the optional group precedes it
            $tokenId = array_pop($parts);
            if ($tokenId === '') continue;
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (the keys of the inner arrays) are written, the frequencies are not stored.
     * Group 0 is written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // write each group's frequency index once, after all of its rows have been changed
            $freqIndex->save();
        }
    }
}