<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Index\AbstractIndex;

/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Provides two APIs:
 * - addTerm()/execute(): For fulltext-style search with Term objects and min-length validation
 * - lookup(): For metadata-style search with exact/wildcard/callback matching, no length restrictions
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var array<int, string> a list of entities that match [entityID => entityName] */
    protected array $entities = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, null until first needed */
    protected ?int $max = null;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding a term whose original name was already registered returns the Term object
     * that was registered first. This keeps $allTerms and $groupedTerms in sync, so every
     * Term handed out by this method is the one that gets fully resolved by execute().
     *
     * @param string $term
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     * @throws SearchException if the given term was too short or otherwise invalid
     */
    public function addTerm(string $term): Term
    {
        $termObj = new Term($term);
        $key = $termObj->getOriginal();

        // a term is only registered once; a second registration would be searched twice
        // while only the latest object would ever have its entities resolved
        if (isset($this->allTerms[$key])) {
            return $this->allTerms[$key];
        }

        // we keep all terms in an array
        $this->allTerms[$key] = $termObj;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $termObj;
            return $termObj;
        }

        $length = $termObj->getLength();
        if ($termObj->getWildcard()) {
            // for wildcards, we need to find tokens from all indexes equal or larger than the term length
            // the maximum is fetched once and reused for all following wildcard terms
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $max = $this->max;
        } else {
            $max = $length;
        }

        for ($i = $length; $i <= $max; $i++) {
            $this->groupedTerms[$i][] = $termObj;
        }

        return $termObj;
    }

    /**
     * Execute the search
     *
     * Runs the three lookup stages in order: tokens, then frequencies, then entity names.
     *
     * @return Term[] All defined terms. Use their methods to access the results
     */
    public function execute(): array
    {
        $this->findTokens();
        $this->findFrequencies();
        $this->findEntities();

        return $this->allTerms;
    }

    /**
     * Get the entities that have the term
     *
     * Empty until execute() has been run.
     *
     * @return array<int, string> [entityID => entityName, ...]
     */
    public function getEntities(): array
    {
        return $this->entities;
    }

    /**
     * Search for values in the collection's token index
     *
     * A simpler API for metadata-style lookups without Term objects or min-length restrictions.
     * Supports exact match, wildcard (*), and callback matching.
     *
     * Every given value is present as a key in the result, even when nothing matched.
     * Entity names are collected per matching token, so a name may be listed more than
     * once for a value when several matching tokens refer to the same entity.
     *
     * @param string|string[] $values search values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [value => [entityName, ...], ...]
     */
    public function lookup(string|array $values, ?callable $func = null): array
    {
        if (!is_array($values)) {
            $values = [$values];
        }

        $result = array_fill_keys($values, []);

        // nothing to look for: do not touch (or fully scan) any index
        if ($result === []) return $result;

        // determine which groups to search
        if ($this->collection->isSplitByLength()) {
            // range(1, 0) would count downwards and yield [1, 0]; without any
            // token index there is simply no group to search
            $max = $this->collection->getTokenIndexMaximum();
            $groups = $max >= 1 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        // find matching token IDs across all groups
        $allMatches = []; // [group => ['matches' => [tokenId => [value, ...]], 'freqs' => [tokenId => [entityId => freq]]]]
        $allEntityIds = []; // [entityId => true], used as a set

        foreach ($groups as $group) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) continue;

            $matches = $this->findMatchingTokens($tokenIndex, $values, $func);
            if (empty($matches)) continue;

            // resolve token IDs to entity frequencies
            $tokenFreqs = $this->collection->resolveTokenFrequencies($group, array_keys($matches));
            foreach ($tokenFreqs as $frequencies) {
                foreach (array_keys($frequencies) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $allMatches[$group] = ['matches' => $matches, 'freqs' => $tokenFreqs];
        }

        if (empty($allEntityIds)) return $result;

        // resolve entity IDs to names
        $entityIndex = $this->collection->getEntityIndex();
        $entityNames = $entityIndex->retrieveRows(array_keys($allEntityIds));

        // assemble results
        foreach ($allMatches as $data) {
            foreach ($data['matches'] as $tokenId => $valList) {
                $pages = [];
                foreach (array_keys($data['freqs'][$tokenId] ?? []) as $entityId) {
                    // entities without a (non-empty) name are skipped
                    if (isset($entityNames[$entityId]) && $entityNames[$entityId] !== '') {
                        $pages[] = $entityNames[$entityId];
                    }
                }
                foreach ($valList as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        return $result;
    }

    /**
     * Find token IDs matching the given values using exact, wildcard, or callback matching
     *
     * With a callback, every non-empty token of the index is passed to it together with each
     * value. Without a callback, a leading and/or trailing * in a value turns it into an
     * unanchored prefix/suffix/substring search; values without * are looked up exactly.
     *
     * @param AbstractIndex $tokenIndex
     * @param string[] $values
     * @param callable|null $func
     * @return array [tokenId => [value, ...], ...] matching token IDs with the values they matched
     */
    protected function findMatchingTokens(AbstractIndex $tokenIndex, array $values, ?callable $func): array
    {
        $matches = [];

        if ($func !== null) {
            // callback matching: iterate all tokens
            foreach ($tokenIndex as $tokenId => $word) {
                if ($word === '') continue;
                foreach ($values as $val) {
                    if ($func($val, $word)) {
                        $matches[$tokenId][] = $val;
                    }
                }
            }
            return $matches;
        }

        foreach ($values as $val) {
            $xval = $val;
            $caret = '^';
            $dollar = '$';
            if (str_starts_with($xval, '*')) {
                // leading wildcard: do not anchor at the start
                $xval = substr($xval, 1);
                $caret = '';
            }
            if (str_ends_with($xval, '*')) {
                // trailing wildcard: do not anchor at the end
                $xval = substr($xval, 0, -1);
                $dollar = '';
            }

            if ($caret === '' || $dollar === '') {
                // wildcard matching
                $re = '/' . $caret . preg_quote($xval, '/') . $dollar . '/';
                foreach ($tokenIndex->search($re) as $tokenId => $word) {
                    $matches[$tokenId][] = $val;
                }
            } else {
                // exact matching
                $tokenId = $tokenIndex->getRowID($val);
                if ($tokenId !== null) {
                    $matches[$tokenId][] = $val;
                }
            }
        }

        return $matches;
    }

    /**
     * Look up the matching tokens for all set terms
     *
     * Each term is searched in every group it was registered for; groups whose
     * token index does not exist are skipped.
     *
     * @return void
     */
    protected function findTokens(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) continue;
            foreach ($terms as $term) {
                $term->addTokens($group, $tokenIndex->search('/^' . $term->getQuoted() . '$/'));
            }
        }
    }

    /**
     * Look up the entity frequencies for all tokens found by findTokens
     *
     * Also records the ID of every entity seen in $this->entities; the names are
     * filled in later by findEntities().
     *
     * @return void
     */
    protected function findFrequencies(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            foreach ($terms as $term) {
                $tokenIds = $term->getTokenIDsByGroup($group);
                if (empty($tokenIds)) continue;

                $tokenFreqs = $this->collection->resolveTokenFrequencies($group, $tokenIds);
                foreach ($tokenFreqs as $frequencies) {
                    foreach ($frequencies as $entityID => $frequency) {
                        $term->addEntityFrequency($entityID, $frequency);
                        $this->entities[$entityID] = ''; // placeholder, name is resolved later
                    }
                }
            }
        }
    }

    /**
     * Lookup the actual names of found entities
     *
     * Replaces the ID placeholders collected by findFrequencies() with the rows of the
     * entity index and hands the resulting list to every term.
     *
     * @return void
     */
    protected function findEntities(): void
    {
        $entityIndex = $this->collection->getEntityIndex();
        $this->entities = $entityIndex->retrieveRows(array_keys($this->entities));

        foreach ($this->allTerms as $term) {
            $term->resolveEntities($this->entities);
        }
    }
}