<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Index\AbstractIndex;

/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Provides two APIs:
 * - addTerm()/execute(): For fulltext-style search with Term objects and min-length validation
 * - lookup(): For metadata-style search with exact/wildcard/callback matching, no length restrictions
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var array<int, string> a list of entities that match [entityID => entityName] */
    protected array $entities = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, fetched lazily for the first wildcard term */
    protected ?int $max = null;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding a term whose original form is already registered returns the already
     * registered Term instead of creating a second one. Otherwise the earlier object
     * would stay in the group lists but drop out of the term list, and would never
     * have its entities resolved by execute().
     *
     * @param string $term
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     * @throws SearchException if the given term was too short or otherwise invalid
     */
    public function addTerm(string $term): Term
    {
        $term = new Term($term);
        $key = $term->getOriginal();

        // the same term must be represented by exactly one Term object
        if (isset($this->allTerms[$key])) {
            return $this->allTerms[$key];
        }
        $this->allTerms[$key] = $term;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $term;
            return $term;
        }

        $length = $term->getLength();
        if ($term->getWildcard()) {
            // for wildcards, we need to find tokens from all indexes equal or larger than the term length
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $max = $this->max;
        } else {
            // without wildcards only the index for exactly this length is relevant
            $max = $length;
        }

        for ($group = $length; $group <= $max; $group++) {
            $this->groupedTerms[$group][] = $term;
        }

        return $term;
    }

    /**
     * Execute the search
     *
     * Runs the three lookup stages in order: tokens, frequencies, entity names.
     *
     * @return Term[] All defined terms. Use their methods to access the results
     */
    public function execute(): array
    {
        $this->findTokens();
        $this->findFrequencies();
        $this->findEntities();

        return $this->allTerms;
    }

    /**
     * Get the entities that have the term
     *
     * @return array<int, string> [entityID => entityName, ...]
     */
    public function getEntities(): array
    {
        return $this->entities;
    }

    /**
     * Search for values in the collection's token index
     *
     * A simpler API for metadata-style lookups without Term objects or min-length restrictions.
     * Supports exact match, wildcard (*), and callback matching.
     *
     * Every given value is present as a key in the result, with an empty list when nothing
     * matched. Entity names are appended per matching token, so an entity can be listed more
     * than once for a value when several matching tokens occur in it.
     *
     * @param string|string[] $values search values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [value => [entityName, ...], ...]
     */
    public function lookup(string|array $values, ?callable $func = null): array
    {
        $values = (array) $values;
        $result = array_fill_keys($values, []);

        // determine which groups to search
        if ($this->collection->isSplitByLength()) {
            $max = $this->collection->getTokenIndexMaximum();
            // range(1, 0) would count DOWN and yield [1, 0], so an empty collection
            // has to be handled explicitly
            $groups = $max >= 1 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        // find matching token IDs across all groups
        $allMatches = []; // [group => ['matches' => [tokenId => [value, ...]], 'freqs' => [tokenId => [entityId => freq]]]]
        $allEntityIds = [];

        foreach ($groups as $group) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            $matches = $this->findMatchingTokens($tokenIndex, $values, $func);
            if ($matches === []) {
                continue;
            }

            // resolve token IDs to entity frequencies and remember every entity seen
            $tokenFreqs = $this->collection->resolveTokenFrequencies($group, array_keys($matches));
            foreach ($tokenFreqs as $frequencies) {
                foreach (array_keys($frequencies) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $allMatches[$group] = ['matches' => $matches, 'freqs' => $tokenFreqs];
        }

        if ($allEntityIds === []) {
            return $result;
        }

        // resolve entity IDs to names
        $entityNames = $this->collection->getEntityIndex()->retrieveRows(array_keys($allEntityIds));

        // assemble results
        foreach ($allMatches as $data) {
            foreach ($data['matches'] as $tokenId => $valList) {
                $pages = [];
                foreach (array_keys($data['freqs'][$tokenId] ?? []) as $entityId) {
                    // entities without a (non-empty) name are skipped
                    if (isset($entityNames[$entityId]) && $entityNames[$entityId] !== '') {
                        $pages[] = $entityNames[$entityId];
                    }
                }

                foreach ($valList as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        return $result;
    }

    /**
     * Find token IDs matching the given values using exact, wildcard, or callback matching
     *
     * With a callback every non-empty token of the index is passed to it together with each
     * value. Without a callback a leading and/or trailing "*" in a value makes it a wildcard
     * search (the remainder is matched literally); all other values are looked up exactly.
     *
     * @param AbstractIndex $tokenIndex
     * @param string[] $values
     * @param callable|null $func
     * @return array [tokenId => [value, ...], ...] matching token IDs with the values they matched
     */
    protected function findMatchingTokens(AbstractIndex $tokenIndex, array $values, ?callable $func): array
    {
        $matches = [];

        if ($func !== null) {
            // callback matching: iterate all tokens
            foreach ($tokenIndex as $tokenId => $word) {
                if ($word === '') {
                    continue;
                }
                foreach ($values as $val) {
                    if ($func($val, $word)) {
                        $matches[$tokenId][] = $val;
                    }
                }
            }

            return $matches;
        }

        foreach ($values as $val) {
            $needle = $val;
            $anchorStart = true;
            $anchorEnd = true;

            if (str_starts_with($needle, '*')) {
                $needle = substr($needle, 1);
                $anchorStart = false;
            }
            if (str_ends_with($needle, '*')) {
                $needle = substr($needle, 0, -1);
                $anchorEnd = false;
            }

            if ($anchorStart && $anchorEnd) {
                // exact matching
                $tokenId = $tokenIndex->getRowID($val);
                if ($tokenId !== null) {
                    $matches[$tokenId][] = $val;
                }
                continue;
            }

            // wildcard matching: only anchor the side that had no "*"
            $re = '/' . ($anchorStart ? '^' : '') . preg_quote($needle, '/') . ($anchorEnd ? '$' : '') . '/';
            foreach (array_keys($tokenIndex->search($re)) as $tokenId) {
                $matches[$tokenId][] = $val;
            }
        }

        return $matches;
    }

    /**
     * Look up the matching tokens for all set terms
     *
     * Each group's token index is searched with every term assigned to that group;
     * groups whose index does not exist are skipped.
     *
     * @return void
     */
    protected function findTokens(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            foreach ($terms as $term) {
                $pattern = '/^' . $term->getQuoted() . '$/';
                $term->addTokens($group, $tokenIndex->search($pattern));
            }
        }
    }

    /**
     * Look up the entity frequencies for all tokens found by findTokens
     *
     * Every entity seen is registered in the entity list with an empty name,
     * the names are filled in later by findEntities().
     *
     * @return void
     */
    protected function findFrequencies(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            foreach ($terms as $term) {
                $tokenIds = $term->getTokenIDsByGroup($group);
                if (empty($tokenIds)) {
                    continue;
                }

                $tokenFreqs = $this->collection->resolveTokenFrequencies($group, $tokenIds);
                foreach ($tokenFreqs as $frequencies) {
                    foreach ($frequencies as $entityID => $frequency) {
                        $term->addEntityFrequency($entityID, $frequency);
                        $this->entities[$entityID] = '';
                    }
                }
            }
        }
    }

    /**
     * Lookup the actual names of found entities
     *
     * Replaces the entity list with the rows retrieved from the entity index and
     * hands the result to every term.
     *
     * @return void
     */
    protected function findEntities(): void
    {
        $entityIndex = $this->collection->getEntityIndex();
        $this->entities = $entityIndex->retrieveRows(array_keys($this->entities));

        foreach ($this->allTerms as $term) {
            $term->resolveEntities($this->entities);
        }
    }
}