<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false,
    ) {
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Indexes configured with an empty name are skipped. If any lock cannot be acquired,
     * all locks obtained so far are released again before the exception is re-thrown.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            if ($idx === '') continue;
            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only remember indexes whose lock was actually obtained
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the instance passed to the constructor if there was one, otherwise a new
     * FileIndex for the configured name, created with the collection's current writable state.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given group
     *
     * Returns the instance passed to the constructor if there was one (the group is not
     * looked at in that case), otherwise a new MemoryIndex for the configured name and group suffix.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getTokenIndex(int $group = 0): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the frequency index for the given group
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @return AbstractIndex
     * @throws IndexLockException
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    public function getFrequencyIndex(int $group = 0): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $this->groupToSuffix($group), $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Whether this collection splits token/frequency indexes by token length
     *
     * @return bool
     */
    public function isSplitByLength(): bool
    {
        return $this->splitByLength;
    }

    /**
     * Convert a logical group number to the index file suffix
     *
     * Group 0 represents non-split indexes (suffix '') while non-zero integers
     * represent split-by-length indexes (suffix = the length).
     *
     * @param int $group
     * @return string The file suffix ('' for group 0, the group number as string otherwise)
     * @throws IndexUsageException when group does not match the collection's split mode
     */
    protected function groupToSuffix(int $group): string
    {
        if ($group === 0 && $this->splitByLength) {
            throw new IndexUsageException('Group 0 is not valid for split-by-length collections');
        }
        if ($group !== 0 && !$this->splitByLength) {
            throw new IndexUsageException("Group $group is not valid for non-split collections");
        }
        return $group === 0 ? '' : (string)$group;
    }

    /**
     * Resolve token IDs to entity frequencies
     *
     * Given a set of token IDs from a specific index group, returns the entities
     * that have those tokens and their frequencies. This encapsulates the frequency
     * index access so that subclasses (e.g. DirectCollection) can provide alternative
     * mappings.
     *
     * @param int $group Index group (0 for non-split, token length for split)
     * @param int[] $tokenIds The token IDs to resolve
     * @return array [tokenId => [entityId => frequency, ...], ...]
     */
    public function resolveTokenFrequencies(int $group, array $tokenIds): array
    {
        $freqIndex = $this->getFrequencyIndex($group);
        if (!$freqIndex->exists()) return [];
        return array_map([TupleOps::class, 'parseTuples'], $freqIndex->retrieveRows($tokenIds));
    }

    /**
     * Return all entity names that have data in this collection
     *
     * Walks every frequency index group, collects the entity IDs referenced there and
     * resolves them to names via the entity index. Empty names are dropped.
     *
     * @return string[] entity names
     */
    public function getEntitiesWithData(): array
    {
        $entityIndex = $this->getEntityIndex();

        // collect entity IDs from all frequency index groups
        if ($this->splitByLength) {
            // range(1, 0) would yield [1, 0] and group 0 is invalid for split collections,
            // so only build the range when at least one token length is reported
            $max = $this->getTokenIndexMaximum();
            $groups = $max >= 1 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        $entityIds = [];
        foreach ($groups as $group) {
            $freqIndex = $this->getFrequencyIndex($group);
            if (!$freqIndex->exists()) continue;
            foreach ($freqIndex as $line) {
                // only the entity IDs matter here, the frequencies are ignored
                foreach (array_keys(TupleOps::parseTuples($line)) as $entityId) {
                    $entityIds[$entityId] = true;
                }
            }
        }

        $names = $entityIndex->retrieveRows(array_keys($entityIds));
        return array_values(array_filter($names, static fn($v) => $v !== ''));
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken->max();
        }
        return (new MemoryIndex($this->idxToken, ''))->max();
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when lock() has not been called before
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under group 0 if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index group
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? Tokenizer::tokenLength($token) : 0;
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens become integer array keys, so cast back to string for the lookup
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (an empty record or a stray ':' separator) are ignored.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            // an empty entry carries no token ID and must not create a '' => 0 assignment
            if ($entry === '') continue;

            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            // without a "group*" prefix nothing is left to pop, which means group 0
            $group = (int)(array_pop($parts) ?? 0);
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are written, the frequencies are not part of the record.
     * Group 0 is written without a "group*" prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === 0 ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // write the group's index once all of its tokens were updated
            $freqIndex->save();
        }
    }
}