<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the token index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException when a token index instance is combined with $splitByLength
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
        // getTokenIndex() hands out the one given instance for every suffix, so splitting is impossible
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index names that were left empty are skipped. If any lock cannot be acquired, all locks
     * acquired so far are released again before the exception is passed on.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        $indexes = [
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ];

        foreach ($indexes as $idx) {
            if ($idx === '') {
                continue;
            }

            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only reached when locking succeeded; unlock() releases exactly these entries
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Clears the list of locked indexes and marks the collection as not writable.
     *
     * @return static
     */
    public function unlock(): static
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }

        $this->lockedIndexes = [];
        $this->isWritable = false;
        return $this;
    }

    /**
     * Get the entity index
     *
     * Returns the instance passed to the constructor if there was one, otherwise a new FileIndex
     * for the configured name, created with the collection's current writable state.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix
     *
     * Returns the instance passed to the constructor if there was one (the suffix is ignored then),
     * otherwise a new MemoryIndex for the configured name and the given suffix.
     *
     * @param int|string $suffix
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getTokenIndex(int|string $suffix): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix
     *
     * @param int|string $suffix
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getFrequencyIndex(int|string $suffix): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @return static
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when the collection has not been locked via lock()
     */
    public function addEntity(string $entity, array $tokens): static
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        $merged = array_replace_recursive($old, $new);

        // frequency indexes are updated first, the reverse record is rewritten afterwards
        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);

        return $this;
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // PHP converts numeric string keys to integers; restore the string before measuring it,
            // just like it is done for the getRowID() lookup below
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength((string)$token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An empty record results in an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => frequency, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (an empty record, doubled or trailing separators) carry no token ID and are skipped.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') {
                // would otherwise register '' as a token ID
                continue;
            }

            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            // no group prefix present: array_pop() on the now empty array yields null
            $group = array_pop($parts) ?? '';
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are written, frequencies are not part of the record. Tokens of the
     * group '' are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // one save per group, after all rows of that group have been changed
            $freqIndex->save();
        }
    }
}