<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var string[] Index names that have been successfully locked */
    protected array $lockedIndexes = [];

    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * @param string $idxEntity Name of the primary entity index, eg. 'page'
     * @param string $idxToken Base name of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     */
    public function __construct(
        protected string $idxEntity,
        protected string $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        $this->unlock();
    }

    /**
     * Lock all indexes for writing
     *
     * Index names that are empty (unused by this collection) are skipped. When any lock can not be
     * acquired, all locks acquired so far are released again before the exception is thrown.
     *
     * Calling this method while the collection is already locked is a no-op.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        // Already holding all locks: acquiring them again would register every index a second time
        // in $lockedIndexes and make unlock() release each lock twice.
        if ($this->isWritable) {
            return $this;
        }

        $indexNames = array_filter([
            $this->idxEntity,
            $this->idxToken,
            $this->idxFrequency,
            $this->idxReverse,
        ]);

        foreach ($indexNames as $idxName) {
            if (!(new Lock($idxName))->acquire()) {
                $this->unlock();
                throw new IndexLockException('Could not lock ' . $idxName . ' for writing');
            }
            $this->lockedIndexes[] = $idxName;
        }

        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes that were successfully locked
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->lockedIndexes as $idxName) {
            (new Lock($idxName))->release();
        }
        $this->lockedIndexes = [];
        $this->isWritable = false;
    }

    /**
     * Get the entity index, opened writable if the collection is currently locked
     *
     * @return FileIndex
     */
    public function getEntityIndex(): FileIndex
    {
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix, opened writable if the collection is currently locked
     *
     * @param int|string $suffix
     * @return MemoryIndex
     */
    public function getTokenIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix, opened writable if the collection is currently locked
     *
     * @param int|string $suffix
     * @return MemoryIndex
     */
    public function getFrequencyIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index, opened writable if the collection is currently locked
     *
     * @return FileIndex
     */
    public function getReverseIndex(): FileIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // old entries all carry 0, so anything not overwritten by $new is marked for removal
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // numeric tokens arrive as integer array keys, so cast back to string explicitly
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength((string)$token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Entries without a token ID (eg. caused by a doubled or trailing ':' separator) are ignored.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            if ($tokenId === '') {
                // nothing to reference: an empty ID would later be used as a row ID in updateIndexes()
                continue;
            }
            // no '*' in the entry means there is no group prefix (non-split collection)
            $group = array_pop($parts) ?? '';
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (the keys of the inner arrays) are written; the frequencies are not stored.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            $freqIndex->save();
        }
    }
}