<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\AbstractIndex;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var array<string|AbstractIndex> Index names or objects that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Entity and token indexes can be passed as already instantiated AbstractIndex objects
     * for sharing between collections. When $idxToken is an object, $splitByLength must be false.
     *
     * @param string|AbstractIndex $idxEntity Name or instance of the primary entity index, eg. 'page'
     * @param string|AbstractIndex $idxToken Name or instance of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     * @throws IndexUsageException
     */
    public function __construct(
        protected string|AbstractIndex $idxEntity,
        protected string|AbstractIndex $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
        // a pre-instantiated token index is a single object, it can not be split by suffix
        if ($idxToken instanceof AbstractIndex && $splitByLength) {
            throw new IndexUsageException('Cannot split by length when using a pre-instantiated token index');
        }
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Indexes configured with an empty name are skipped. If any lock can not be acquired, all locks
     * acquired so far are released again before the exception is passed on.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        foreach (
            [
                $this->idxEntity,
                $this->idxToken,
                $this->idxFrequency,
                $this->idxReverse
            ] as $idx
        ) {
            if ($idx === '') continue;
            try {
                if ($idx instanceof AbstractIndex) {
                    $idx->lock();
                } else {
                    Lock::acquire($idx);
                }
                // only remember the index once locking succeeded, so unlock() never releases a foreign lock
                $this->lockedIndexes[] = $idx;
            } catch (IndexLockException $e) {
                $this->unlock();
                throw $e;
            }
        }
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->lockedIndexes as $idx) {
            if ($idx instanceof AbstractIndex) {
                $idx->unlock();
            } else {
                Lock::release($idx);
            }
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
    }

    /**
     * Get the entity index
     *
     * Returns the instance passed to the constructor if there was one, otherwise a new FileIndex
     * for the configured name. The writable flag of a new index reflects the current lock state.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): AbstractIndex
    {
        if ($this->idxEntity instanceof AbstractIndex) {
            return $this->idxEntity;
        }
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix
     *
     * Returns the instance passed to the constructor if there was one (the suffix is ignored then),
     * otherwise a new MemoryIndex for the configured name and the given suffix.
     *
     * @param int|string $suffix
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getTokenIndex(int|string $suffix): AbstractIndex
    {
        if ($this->idxToken instanceof AbstractIndex) {
            return $this->idxToken;
        }
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix
     *
     * Always creates a new MemoryIndex for the configured frequency base name.
     *
     * @param int|string $suffix
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getFrequencyIndex(int|string $suffix): AbstractIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * Always creates a new FileIndex for the configured reverse index name.
     *
     * @return AbstractIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): AbstractIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // array_replace_recursive (unlike array_merge_recursive) keeps the numeric group and token ID keys
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // numeric tokens come back as int array keys, so cast them back before measuring
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength((string)$token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Entries without a token ID (an empty record, a stray ':' or a dangling "group*") are skipped, so
     * an empty record yields an empty array and no empty token ID is ever handed to the frequency index.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            // the last part is always the token ID, an optional first part is the group
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            $group = array_pop($parts) ?? '';
            if ($tokenId === '') continue; // malformed or empty entry, nothing to reference
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (the inner array keys) are written, the frequencies are not stored.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            // the '' group marks a non-split collection and is written without prefix
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // write each group's index once, after all of its rows have been changed
            $freqIndex->save();
        }
    }
}