<?php

namespace dokuwiki\Search\Collection;

/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Use addTerm() to register search terms (with optional wildcards), then call execute().
 * Set caseInsensitive() on the search or on individual terms for case-insensitive matching.
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, fetched lazily on the first wildcard term */
    protected ?int $max = null;

    /** @var bool default case sensitivity for new terms */
    protected bool $defaultCaseInsensitive = false;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Enable case-insensitive matching for all subsequently added terms
     *
     * Terms that were added before this call are not touched.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->defaultCaseInsensitive = true;
        return $this;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding the same term string again replaces the earlier entry in the map returned by
     * execute(); the earlier Term object stays registered in its groups and is still matched.
     *
     * @param string $term the search term, may include * wildcards at start/end
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     */
    public function addTerm(string $term): Term
    {
        $searchTerm = new Term($term);

        if ($this->defaultCaseInsensitive) {
            $searchTerm->caseInsensitive();
        }

        // we keep all terms in an array
        $this->allTerms[$searchTerm->getOriginal()] = $searchTerm;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $searchTerm;
            return $searchTerm;
        }

        $length = $searchTerm->getLength();
        if ($searchTerm->getWildcard()) {
            // for wildcards, we need to find tokens from all indexes equal or larger than the term length;
            // the maximum is only asked from the collection once and then reused
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $lastGroup = $this->max;
        } else {
            // exact terms only need the index of their own length
            $lastGroup = $length;
        }

        for ($group = $length; $group <= $lastGroup; $group++) {
            $this->groupedTerms[$group][] = $searchTerm;
        }

        return $searchTerm;
    }

    /**
     * Execute the search
     *
     * For each index group, scans the token index once testing all terms, then asks the
     * collection which entities have the matched tokens. After all groups are processed,
     * entity IDs are batch-resolved to names via the entity index, and each Term receives
     * its results through addMatch(entity name, token, frequency).
     *
     * NOTE(review): nothing is reset between calls, so calling execute() again repeats the
     * scan and calls addMatch() again on the same Term objects - confirm against
     * Term::addMatch() whether that is harmless.
     *
     * @return Term[] All defined terms keyed by original term string
     */
    public function execute(): array
    {
        // Pass 1: per group, scan tokens and resolve frequencies
        $allEntityIds = []; // used as a set: [entityId => true]
        $groupResults = [];
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            // single-pass token scan for all terms in this group
            $tokenMatches = []; // [tokenId => [{term, token}, ...]]
            foreach ($tokenIndex as $tokenId => $tokenValue) {
                if ($tokenValue === '') {
                    continue;
                }
                foreach ($terms as $term) {
                    if ($term->matches($tokenValue)) {
                        $tokenMatches[$tokenId][] = ['term' => $term, 'token' => $tokenValue];
                    }
                }
            }
            if ($tokenMatches === []) {
                continue;
            }

            // resolve which entities have these tokens
            $freqs = $this->collection->resolveTokenFrequencies($group, array_keys($tokenMatches));

            // collect entity IDs for batch name resolution
            foreach ($freqs as $entityFreqs) {
                foreach (array_keys($entityFreqs) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $groupResults[] = ['matches' => $tokenMatches, 'freqs' => $freqs];
        }

        if ($allEntityIds === []) {
            return $this->allTerms;
        }

        // Batch resolve entity IDs to names with a single retrieveRows() call
        $entityMap = $this->collection->getEntityIndex()->retrieveRows(array_keys($allEntityIds));

        // Pass 2: populate Terms with fully resolved data
        foreach ($groupResults as $data) {
            foreach ($data['freqs'] as $tokenId => $entityFreqs) {
                // defensive: ignore token IDs that were not part of our own scan
                $matches = $data['matches'][$tokenId] ?? [];
                if ($matches === []) {
                    continue;
                }

                // resolve the entity names once per token instead of once per matching term;
                // entities without a (non-empty) name are dropped
                $namedFreqs = [];
                foreach ($entityFreqs as $entityId => $freq) {
                    $entityName = $entityMap[$entityId] ?? '';
                    if ($entityName === '') {
                        continue;
                    }
                    $namedFreqs[] = [$entityName, $freq];
                }

                foreach ($matches as $match) {
                    foreach ($namedFreqs as [$entityName, $freq]) {
                        $match['term']->addMatch($entityName, $match['token'], $freq);
                    }
                }
            }
        }

        return $this->allTerms;
    }
}