<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\TupleOps;
use dokuwiki\Search\Tokenizer;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;

/**
 * Abstract base class for index collections
 *
 * A collection manages a group of related indexes that together provide a specific search use case.
 * Every collection works with four index types: entity, token, frequency, and reverse.
 *
 * entity - the list of the main entities (eg. pages)
 * token - the list of tokens (eg. words) assigned to entities (can be split into multiple files)
 * frequency - how often a token appears on an entity (can be split into multiple files)
 * reverse - the list of tokens assigned to each entity
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
abstract class AbstractCollection
{
    /** @var bool Has a lock been acquired for all used indexes? */
    protected bool $isWritable = false;

    /**
     * Initialize the collection with the names of the indexes it manages
     *
     * @param string $idxEntity Name of the primary entity index, eg. 'page'
     * @param string $idxToken Base name of the secondary entity index, eg. 'w' for words
     * @param string $idxFrequency Base name of the frequency index, eg. 'i' for word frequencies
     * @param string $idxReverse Name of the reverse index, eg. 'pageword'
     * @param bool $splitByLength Whether to split token/frequency indexes by token length
     */
    public function __construct(
        protected string $idxEntity,
        protected string $idxToken,
        protected string $idxFrequency,
        protected string $idxReverse,
        protected bool $splitByLength = false
    ) {
    }

    /**
     * Destructor
     *
     * Ensures locks are released when the class is destroyed
     */
    public function __destruct()
    {
        if ($this->isWritable) {
            $this->unlock();
        }
    }

    /**
     * The index names that are locked and unlocked together
     *
     * @return string[]
     */
    private function lockTargets(): array
    {
        return [$this->idxEntity, $this->idxToken, $this->idxFrequency, $this->idxReverse];
    }

    /**
     * Lock all indexes for writing
     *
     * Calling this while the collection is already writable is a no-op. If any lock cannot be acquired,
     * the locks acquired by this call are released again before the exception is thrown.
     *
     * @return $this can be used for chaining
     * @throws IndexLockException
     */
    public function lock(): static
    {
        // we already hold all locks, do not try to acquire them a second time
        if ($this->isWritable) {
            return $this;
        }

        $acquired = [];
        foreach ($this->lockTargets() as $idxName) {
            if (!(new Lock($idxName))->acquire()) {
                // Roll back only what this call acquired. Locks are addressed by name, so releasing a name
                // we never acquired might drop a lock held by someone else.
                // NOTE(review): confirm against Lock::release() semantics
                foreach (array_reverse($acquired) as $heldName) {
                    (new Lock($heldName))->release();
                }
                throw new IndexLockException('Could not lock ' . $idxName . ' for writing');
            }
            $acquired[] = $idxName;
        }

        // locking succeeded
        $this->isWritable = true;
        return $this;
    }

    /**
     * Unlock all indexes
     *
     * Releases the lock of every index in this collection and marks the collection as read-only.
     *
     * @return void
     */
    public function unlock(): void
    {
        foreach ($this->lockTargets() as $idxName) {
            (new Lock($idxName))->release();
        }
        $this->isWritable = false;
    }

    /**
     * Get the entity index, writable if the collection is locked
     *
     * @return FileIndex
     */
    public function getEntityIndex(): FileIndex
    {
        return new FileIndex($this->idxEntity, '', $this->isWritable);
    }

    /**
     * Get the token index for the given suffix, writable if the collection is locked
     *
     * @param int|string $suffix
     * @return MemoryIndex
     */
    public function getTokenIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxToken, $suffix, $this->isWritable);
    }

    /**
     * Get the frequency index for the given suffix, writable if the collection is locked
     *
     * @param int|string $suffix
     * @return MemoryIndex
     */
    public function getFrequencyIndex(int|string $suffix): MemoryIndex
    {
        return new MemoryIndex($this->idxFrequency, $suffix, $this->isWritable);
    }

    /**
     * Get the reverse index, writable if the collection is locked
     *
     * @return FileIndex
     */
    public function getReverseIndex(): FileIndex
    {
        return new FileIndex($this->idxReverse, '', $this->isWritable);
    }

    /**
     * Maximum suffix for the token indexes (eg. max word length currently stored)
     *
     * @return int
     */
    public function getTokenIndexMaximum(): int
    {
        return $this->getTokenIndex('')->max(); // no suffix needed to access the maximum
    }

    /**
     * Add or update the tokens for a given entity
     *
     * The given list of tokens replaces the previously stored list for that entity. An empty list removes the
     * entity from the index.
     *
     * The update merges old and new token data. getReverseAssignments() returns all previously stored token IDs
     * with a value of 0 (see parseReverseRecord). resolveTokens() returns the new token IDs with their values.
     * After array_replace_recursive, tokens only in the old map keep value 0 — causing updateIndexes to delete
     * them from the frequency index via TupleOps::updateTuple. Tokens in the new map overwrite with their value.
     *
     * @param string $entity The name of the entity
     * @param string[] $tokens The list of tokens for this entity
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    public function addEntity(string $entity, array $tokens): void
    {
        if (!$this->isWritable) {
            throw new IndexLockException('Indexes not locked. Forgot to call lock()?');
        }

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $old = $this->getReverseAssignments($entity);
        $new = $this->resolveTokens($tokens);

        // keys are preserved here, so token IDs present only in $old keep their 0 marker
        $merged = array_replace_recursive($old, $new);

        $this->updateIndexes($merged, $entityId);
        $this->saveReverseAssignments($entity, $merged);
    }

    /**
     * Resolve raw tokens into the two-level structure [group => [tokenId => frequency]]
     *
     * Calls countTokens() to get token frequencies (subclass responsibility), then groups
     * by token length if splitByLength is enabled, or under '' if not. Finally resolves
     * token strings to IDs via the appropriate token index.
     *
     * @param string[] $tokens The raw token list
     * @return array [group => [tokenId => frequency, ...], ...]
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function resolveTokens(array $tokens): array
    {
        $counted = $this->countTokens($tokens);

        // group tokens by their index suffix
        $groups = [];
        foreach ($counted as $token => $freq) {
            $group = $this->splitByLength ? (string)Tokenizer::tokenLength($token) : '';
            $groups[$group][$token] = $freq;
        }

        // resolve token strings to IDs
        $result = [];
        foreach ($groups as $group => $tokenFreqs) {
            $tokenIndex = $this->getTokenIndex($group);
            $result[$group] = [];
            foreach ($tokenFreqs as $token => $freq) {
                // numeric tokens were turned into integer array keys, cast them back
                $tokenId = $tokenIndex->getRowID((string)$token);
                $result[$group][$tokenId] = $freq;
            }
            $tokenIndex->save();
        }

        return $result;
    }

    /**
     * Count or deduplicate tokens and return their frequencies
     *
     * FrequencyCollections return actual occurrence counts.
     * LookupCollections deduplicate and return 1 for each token.
     *
     * @param string[] $tokens The raw token list
     * @return array [token => frequency, ...]
     */
    abstract protected function countTokens(array $tokens): array;

    /**
     * Get the token assignments for a given entity from the reverse index
     *
     * Returns the parsed reverse index record. The exact structure depends on the collection type.
     *
     * @param string $entity
     * @return array
     * @throws IndexAccessException
     * @throws IndexWriteException
     */
    public function getReverseAssignments(string $entity): array
    {
        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $record = $reverseIndex->retrieveRow($entityId);

        if ($record === '') {
            return [];
        }

        return $this->parseReverseRecord($record);
    }

    /**
     * Store the reverse index info about what tokens are assigned to the entity
     *
     * @param string $entity
     * @param array $data The assignment data to store
     * @return void
     * @throws IndexAccessException
     * @throws IndexWriteException
     * @throws IndexLockException
     */
    protected function saveReverseAssignments(string $entity, array $data): void
    {
        // remove tokens with frequency 0 (no longer assigned), then remove empty groups
        $data = array_map('array_filter', $data);
        $data = array_filter($data);

        $record = $this->formatReverseRecord($data);

        $entityIndex = $this->getEntityIndex();
        $entityId = $entityIndex->accessCachedValue($entity);

        $reverseIndex = $this->getReverseIndex();
        $reverseIndex->changeRow($entityId, $record);
    }

    /**
     * Parse a reverse index record into a two-level array
     *
     * The reverse index only stores which token IDs belong to an entity, not their frequencies. All values
     * in the returned array are set to 0. This is intentional: when merged with new data in addEntity(),
     * tokens absent from the new data retain 0, signaling deletion from the frequency index.
     *
     * For split collections the format is "group*tokenId:group*tokenId:..." where group is the token length.
     * For non-split collections the group prefix is omitted: "tokenId:tokenId:..."
     * This mirrors how TupleOps omits *1 for frequency 1.
     *
     * Empty entries (eg. caused by a stray or trailing separator) are ignored.
     *
     * @param string $record The raw reverse index record
     * @return array [group => [tokenId => 0, ...], ...]
     */
    protected function parseReverseRecord(string $record): array
    {
        $result = [];
        foreach (explode(':', $record) as $entry) {
            if ($entry === '') {
                continue; // nothing between two separators, would yield an empty token ID
            }
            $parts = explode('*', $entry, 2);
            $tokenId = array_pop($parts);
            $group = array_pop($parts) ?? ''; // no prefix means the non-split group
            $result[$group][$tokenId] = 0;
        }
        return $result;
    }

    /**
     * Format a two-level array into a reverse index record string
     *
     * Only the token IDs (array keys) are written, the frequencies are not part of the record.
     *
     * @param array $data [group => [tokenId => freq, ...], ...]
     * @return string The formatted record
     */
    protected function formatReverseRecord(array $data): string
    {
        $parts = [];
        foreach ($data as $group => $tokens) {
            $prefix = $group === '' ? '' : "$group*";
            foreach (array_keys($tokens) as $tokenId) {
                $parts[] = $prefix . $tokenId;
            }
        }
        return implode(':', $parts);
    }

    /**
     * Update frequency indexes with the given data
     *
     * Iterates over the two-level structure [group => [tokenId => freq]] and updates the
     * corresponding frequency index for each group. A frequency of 0 removes the entity
     * from that token's frequency record.
     *
     * @param array $data [group => [tokenId => frequency, ...], ...]
     * @param int $entityId The entity ID
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function updateIndexes(array $data, int $entityId): void
    {
        foreach ($data as $group => $tokens) {
            $freqIndex = $this->getFrequencyIndex($group);
            foreach ($tokens as $tokenId => $freq) {
                $record = $freqIndex->retrieveRow($tokenId);
                $record = TupleOps::updateTuple($record, $entityId, $freq);
                $freqIndex->changeRow($tokenId, $record);
            }
            $freqIndex->save();
        }
    }
}