<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Index\AbstractIndex;

/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Provides two APIs:
 * - addTerm()/execute(): For fulltext-style search with Term objects and min-length validation
 * - lookup(): For metadata-style search with exact/wildcard/callback matching, no length restrictions
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var array<int, string> a list of entities that match [entityID => entityName] */
    protected array $entities = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, cached by the first wildcard term */
    protected ?int $max = null;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding the same term more than once returns the Term object that was registered first.
     *
     * @param string $term
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     * @throws SearchException if the given term was too short or otherwise invalid
     */
    public function addTerm(string $term): Term
    {
        $term = new Term($term);
        $key = $term->getOriginal();

        // A second Term object for the same term would replace the first one in $allTerms while both
        // stay in $groupedTerms. findEntities() only walks $allTerms, so the object handed out first
        // would never get its entities resolved. Reuse the registered object instead.
        if (isset($this->allTerms[$key])) {
            return $this->allTerms[$key];
        }

        // we keep all terms in an array
        $this->allTerms[$key] = $term;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $term;
            return $term;
        }

        // for wildcards, we need to find tokens from all indexes equal or larger than the term length
        if ($term->getWildcard()) {
            // the maximum is only fetched once per search instance
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $max = $this->max;
        } else {
            $max = $term->getLength();
        }

        for ($i = $term->getLength(); $i <= $max; $i++) {
            $this->groupedTerms[$i][] = $term;
        }

        return $term;
    }

    /**
     * Execute the search
     *
     * Runs the three lookup stages in order: tokens, frequencies, entity names.
     *
     * NOTE(review): every call re-runs all stages on the same Term objects. Whether Term accumulates
     * or replaces previously added data is not visible in this class - confirm before calling this
     * more than once on the same instance.
     *
     * @return Term[] All defined terms. Use their methods to access the results
     */
    public function execute(): array
    {
        $this->findTokens();
        $this->findFrequencies();
        $this->findEntities();

        return $this->allTerms;
    }

    /**
     * Get the entities that have the term
     *
     * Only filled after execute() has been called.
     *
     * @return array<int, string> [entityID => entityName, ...]
     */
    public function getEntities(): array
    {
        return $this->entities;
    }

    /**
     * Search for values in the collection's token index
     *
     * A simpler API for metadata-style lookups without Term objects or min-length restrictions.
     * Supports exact match, wildcard (*), and callback matching.
     *
     * Every requested value is a key in the result, values without a match map to an empty list.
     * Each entity is listed at most once per value, even when several tokens matched the value.
     *
     * @param string|string[] $values search values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [value => [entityName, ...], ...]
     */
    public function lookup(string|array $values, ?callable $func = null): array
    {
        // A repeated value would be matched once per repetition but collected under a single result
        // key, which would list its entities several times. Search each distinct value only once.
        $values = array_unique((array) $values);

        $result = array_fill_keys($values, []);

        // determine which groups to search
        if ($this->collection->isSplitByLength()) {
            $max = $this->collection->getTokenIndexMaximum();
            $groups = $max > 0 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        // find matching token IDs across all groups
        // [group => ['matches' => [tokenId => [value, ...]], 'freqs' => [tokenId => [entityId => freq]]]]
        $allMatches = [];
        $allEntityIds = []; // [entityId => true], used as a set

        foreach ($groups as $group) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) continue;

            $matches = $this->findMatchingTokens($tokenIndex, $values, $func);
            if (empty($matches)) continue;

            // resolve token IDs to entity frequencies
            $tokenFreqs = $this->collection->resolveTokenFrequencies($group, array_keys($matches));
            foreach ($tokenFreqs as $frequencies) {
                foreach (array_keys($frequencies) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $allMatches[$group] = ['matches' => $matches, 'freqs' => $tokenFreqs];
        }

        if (empty($allEntityIds)) return $result;

        // resolve entity IDs to names with a single read of the entity index
        $entityIndex = $this->collection->getEntityIndex();
        $entityNames = $entityIndex->retrieveRows(array_keys($allEntityIds));

        // assemble results
        foreach ($allMatches as $data) {
            foreach ($data['matches'] as $tokenId => $valList) {
                // a matched token without frequency data contributes nothing
                if (!isset($data['freqs'][$tokenId])) continue;

                $pages = [];
                foreach (array_keys($data['freqs'][$tokenId]) as $entityId) {
                    // entities without a name or with an empty name are skipped
                    $name = $entityNames[$entityId] ?? '';
                    if ($name !== '') {
                        $pages[] = $name;
                    }
                }
                foreach ($valList as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        // A value can match several tokens that belong to the same entity. Report each entity once,
        // in first-seen order. array_unique() is used instead of a keyed set so that numeric entity
        // names are not turned into integers.
        return array_map(
            static fn (array $pages): array => array_values(array_unique($pages)),
            $result
        );
    }

    /**
     * Find token IDs matching the given values using exact, wildcard, or callback matching
     *
     * With a callback, every non-empty token of the index is passed to it together with each value.
     * Without a callback, a leading and/or trailing "*" in a value makes it a wildcard search,
     * all other values are looked up as exact matches.
     *
     * @param AbstractIndex $tokenIndex
     * @param string[] $values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [tokenId => [value, ...], ...] matching token IDs with the values they matched
     */
    protected function findMatchingTokens(AbstractIndex $tokenIndex, array $values, ?callable $func): array
    {
        $matches = [];

        if ($func !== null) {
            // callback matching: iterate all tokens
            foreach ($tokenIndex as $tokenId => $word) {
                if ($word === '') continue;
                foreach ($values as $val) {
                    if ($func($val, $word)) {
                        $matches[$tokenId][] = $val;
                    }
                }
            }

            return $matches;
        }

        foreach ($values as $val) {
            $needle = $val;
            $anchorStart = '^';
            $anchorEnd = '$';

            // a leading or trailing asterisk drops the corresponding anchor
            if (str_starts_with($needle, '*')) {
                $needle = substr($needle, 1);
                $anchorStart = '';
            }
            if (str_ends_with($needle, '*')) {
                $needle = substr($needle, 0, -1);
                $anchorEnd = '';
            }

            if ($anchorStart === '' || $anchorEnd === '') {
                // wildcard matching
                $re = '/' . $anchorStart . preg_quote($needle, '/') . $anchorEnd . '/';
                foreach ($tokenIndex->search($re) as $tokenId => $word) {
                    $matches[$tokenId][] = $val;
                }
            } else {
                // exact matching
                $tokenId = $tokenIndex->getRowID($val);
                if ($tokenId !== null) {
                    $matches[$tokenId][] = $val;
                }
            }
        }

        return $matches;
    }

    /**
     * Look up the matching tokens for all set terms
     *
     * Each term is searched in every group it was registered for by addTerm(). Groups whose
     * token index does not exist are skipped.
     *
     * @return void
     */
    protected function findTokens(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) continue;
            foreach ($terms as $term) {
                $term->addTokens($group, $tokenIndex->search('/^' . $term->getQuoted() . '$/'));
            }
        }
    }

    /**
     * Look up the entity frequencies for all tokens found by findTokens
     *
     * Every entity seen is recorded in $this->entities with an empty name. The names are
     * filled in later by findEntities().
     *
     * @return void
     */
    protected function findFrequencies(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            foreach ($terms as $term) {
                $tokenIds = $term->getTokenIDsByGroup($group);
                if (empty($tokenIds)) continue;

                $tokenFreqs = $this->collection->resolveTokenFrequencies($group, $tokenIds);
                foreach ($tokenFreqs as $frequencies) {
                    foreach ($frequencies as $entityID => $frequency) {
                        $term->addEntityFrequency($entityID, $frequency);
                        $this->entities[$entityID] = '';
                    }
                }
            }
        }
    }

    /**
     * Lookup the actual names of found entities
     *
     * Replaces the placeholder list built by findFrequencies() with the rows read from the
     * entity index and hands the result to every registered term.
     *
     * @return void
     */
    protected function findEntities(): void
    {
        $entityIndex = $this->collection->getEntityIndex();
        $this->entities = $entityIndex->retrieveRows(array_keys($this->entities));

        foreach ($this->allTerms as $term) {
            $term->resolveEntities($this->entities);
        }
    }
}