<?php

namespace dokuwiki\Search\Collection;

/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Use addTerm() to register search terms (with optional wildcards), then call execute().
 * Set caseInsensitive() on the search or on individual terms for case-insensitive matching.
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, null until first needed */
    protected ?int $max = null;

    /** @var bool whether newly added terms are switched to case-insensitive matching */
    protected bool $defaultCaseInsensitive = false;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Enable case-insensitive matching for all subsequently added terms
     *
     * Terms that were added before this call are not touched.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->defaultCaseInsensitive = true;
        return $this;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * On a split-by-length collection the term is registered for the group matching its own
     * length; a wildcard term is additionally registered for every larger group up to the
     * collection's current token index maximum. On a non-split collection all terms share group 0.
     *
     * Adding the same original term again replaces the entry returned by execute(); the
     * previously created Term object stays registered in its groups.
     *
     * @param string $term the search term, may include * wildcards at start/end
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     */
    public function addTerm(string $term): Term
    {
        $term = new Term($term);

        if ($this->defaultCaseInsensitive) {
            $term->caseInsensitive();
        }

        // we keep all terms in an array
        $this->allTerms[$term->getOriginal()] = $term;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $term;
            return $term;
        }

        $length = $term->getLength();
        if ($term->getWildcard()) {
            // for wildcards, we need to find tokens from all indexes equal or larger than the term length;
            // the maximum is fetched lazily and only once per search
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $max = $this->max;
        } else {
            $max = $length;
        }

        for ($i = $length; $i <= $max; $i++) {
            $this->groupedTerms[$i][] = $term;
        }

        return $term;
    }

    /**
     * Execute the search
     *
     * For each index group, scans the token index once testing all terms, then asks the
     * collection which entities have the matched tokens. After all groups are processed,
     * entity IDs are batch-resolved to names via the entity index, and each Term receives
     * its matches as (entity name, token, frequency) via Term::addMatch().
     *
     * Groups whose token index does not exist, and entities whose name resolves to an
     * empty string, are skipped.
     *
     * @return Term[] All defined terms keyed by original term string
     */
    public function execute(): array
    {
        // Pass 1: per group, scan tokens and resolve frequencies
        $allEntityIds = [];
        $groupResults = [];
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            // single-pass token scan for all terms in this group
            $tokenMatches = []; // [tokenId => [['term' => Term, 'token' => string], ...]]
            foreach ($tokenIndex as $tokenId => $tokenValue) {
                if ($tokenValue === '') {
                    continue;
                }
                foreach ($terms as $term) {
                    if ($term->matches($tokenValue)) {
                        $tokenMatches[$tokenId][] = ['term' => $term, 'token' => $tokenValue];
                    }
                }
            }
            if (empty($tokenMatches)) {
                continue;
            }

            // resolve which entities have these tokens; consumed below as [tokenId => [entityId => frequency]]
            $freqs = $this->collection->resolveTokenFrequencies($group, array_keys($tokenMatches));

            // collect entity IDs for batch name resolution (array keys de-duplicate them)
            foreach ($freqs as $entityFreqs) {
                foreach (array_keys($entityFreqs) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $groupResults[] = ['matches' => $tokenMatches, 'freqs' => $freqs];
        }

        if (empty($allEntityIds)) {
            return $this->allTerms;
        }

        // Batch resolve entity IDs to names in one call
        $entityMap = $this->collection->getEntityIndex()->retrieveRows(array_keys($allEntityIds));

        // Pass 2: populate Terms with fully resolved data
        foreach ($groupResults as $data) {
            foreach ($data['freqs'] as $tokenId => $entityFreqs) {
                // tolerate token IDs in the frequency result that were not part of the scanned matches
                $matches = $data['matches'][$tokenId] ?? [];
                foreach ($matches as $match) {
                    foreach ($entityFreqs as $entityId => $freq) {
                        $entityName = $entityMap[$entityId] ?? '';
                        if ($entityName === '') {
                            continue;
                        }
                        $match['term']->addMatch($entityName, $match['token'], $freq);
                    }
                }
            }
        }

        return $this->allTerms;
    }
}