<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var string[] Index names that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * @param string $idxEntity Name of the primary entity index, eg. 'page'
     * @param string $idxToken Base name of the token index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     */
    public function __construct(
        protected string $idxEntity,
        protected string $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false,
    ) {
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index names that were configured as empty strings are skipped. If any lock cannot be
     * acquired, the locks obtained so far are released again before the exception is rethrown.
     * Calling this method while the collection is already writable is a no-op.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        // all locks are already held; acquiring again would only record duplicate names
        if ($this->isWritable) {
            return $this;
        }

        $names = array_filter([
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ]);

        foreach ($names as $idxName) {
            try {
                Lock::acquire($idxName);
                $this->lockedIndexes[] = $idxName;
            } catch (IndexLockException $e) {
                // all or nothing: give back whatever was locked before this failure
                $this->unlock();
                throw $e;
            }
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * Safe to call when nothing is locked. Afterwards the collection is no longer writable.
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->lockedIndexes as $idxName) {
            Lock::release($idxName);
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
    }

    /**
     * Get the entity index
     *
     * The collection's current lock state is handed to the index as its third constructor argument.
     *
     * @return FileIndex
     * @throws IndexLockException
     */
    public function getEntityIndex(): FileIndex
    {
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix
     *
     * @param int|string $suffix The index suffix (the token length for split collections, '' otherwise)
     * @return MemoryIndex
     * @throws IndexLockException
     */
    public function getTokenIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix
     *
     * @param int|string $suffix The index suffix (the token length for split collections, '' otherwise)
     * @return MemoryIndex
     * @throws IndexLockException
     */
    public function getFrequencyIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index
     *
     * @return FileIndex
     * @throws IndexLockException
     */
    public function getReverseIndex(): FileIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     * @throws IndexLockException
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries carry 0, so anything not overwritten by $new is flagged for removal
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // PHP turns numeric string keys into integers, so restore the string form first
            $token = (string)$token;
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength($token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            // save once per group, after all lookups for that group are done
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An entity without a stored record yields an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store, [group => [tokenId => freq, ...], ...]
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map(static fn(array $tokens): array => array_filter($tokens), $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Entries without a token ID (an empty record, or a stray or trailing ':') are skipped.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            $parts = explode('*', $entry, 2);
            // the token ID is always the last part; a group prefix is optional
            $tokenId = array_pop($parts);
            if ($tokenId === '') {
                continue; // nothing to assign, would otherwise create a phantom '' token
            }
            $group = array_pop($parts) ?? '';
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are written; the frequencies in the given data are ignored.
     * Tokens in the '' group are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // save once per group, after all of its token rows were changed
            $freqIndex->save();
        }
    }
}