<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity    - the list of the main entities (eg. pages)
 * token     - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse   - the list of tokens assigned to each entity
 *
 * Internally, token assignments are passed around as a two-level array
 * [group => [tokenId => frequency]]. The group is the token length for split collections
 * and the empty string otherwise.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * Index names left empty are skipped when locking and unlocking.
     *
     * @param string $idxEntity Name of the primary entity index, eg. 'page'
     * @param string $idxToken Base name of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     */
    public function __construct(
        protected string $idxEntity,
        protected string $idxToken,
        protected string $idxFrequency = '',
        protected string $idxReverse = '',
        protected bool $splitByLength = false
    ) {
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        if ($this->isWritable) {
            $this->unlock();
        }
    }

    /**
     * Lock all indexes for writing
     *
     * Calling this on an already locked collection is a no-op. When one of the locks can not be
     * acquired, only the locks taken by this call are rolled back before the exception is thrown.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        if ($this->isWritable) {
            // we already hold all locks; acquiring them again could fail against ourselves
            return $this;
        }

        $acquired = [];
        foreach ($this->lockableIndexNames() as $idxName) {
            if (!(new Lock($idxName))->acquire()) {
                // Release only what this call acquired. Locks we never got may be held by
                // someone else (eg. another collection sharing the entity index) and must
                // not be released on their behalf.
                foreach (array_reverse($acquired) as $heldName) {
                    (new Lock($heldName))->release();
                }
                throw new IndexLockException('Could not lock ' . $idxName . ' for writing');
            }
            $acquired[] = $idxName;
        }

        // locking succeeded
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->lockableIndexNames() as $idxName) {
            (new Lock($idxName))->release();
        }
        $this->isWritable = false;
    }

    /**
     * The names of all configured indexes that take part in locking
     *
     * @return string[] the non-empty index names in the order entity, token, frequency, reverse
     */
    private function lockableIndexNames(): array
    {
        return array_filter([$this->idxEntity, $this->idxToken, $this->idxFrequency, $this->idxReverse]);
    }

    /**
     * Get the entity index, opened writable if the collection is locked
     *
     * @return FileIndex
     */
    public function getEntityIndex(): FileIndex
    {
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix, opened writable if the collection is locked
     *
     * @param int|string $suffix The index suffix (token length for split collections, '' otherwise)
     * @return MemoryIndex
     */
    public function getTokenIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix, opened writable if the collection is locked
     *
     * @param int|string $suffix The index suffix (token length for split collections, '' otherwise)
     * @return MemoryIndex
     */
    public function getFrequencyIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index, opened writable if the collection is locked
     *
     * @return FileIndex
     */
    public function getReverseIndex(): FileIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException when lock() has not been called before
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // keys are preserved on both levels: old-only tokens stay at 0, new tokens carry their frequency
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            // PHP turns numeric string array keys into integers, so cast back before measuring
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength((string)$token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     * An entity without a stored record yields an empty array.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (and thus an empty record) are skipped, they do not describe a token.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') {
                continue;
            }
            // "group*tokenId" or just "tokenId": the token ID is always the last part
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            $group = array_pop($parts) ?? '';
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs are written, the frequencies are not part of the record. Tokens in the
     * '' group are written without a group prefix.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            // write each group's index once after all of its rows were changed
            $freqIndex->save();
        }
    }
}