xref: /dokuwiki/inc/Search/Collection/CollectionSearch.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
16734bb8cSAndreas Gohr<?php
26734bb8cSAndreas Gohr
36734bb8cSAndreas Gohrnamespace dokuwiki\Search\Collection;
46734bb8cSAndreas Gohr
/**
 * Look up one or more (optionally wildcarded) terms in a collection
 *
 * Accepts any AbstractCollection (Frequency, Lookup, Direct). Whether the collection
 * keeps its tokens split by length or in a single index is handled internally.
 *
 * Typical use: register the terms with addTerm() (wildcards allowed), then run execute().
 * Case-insensitive matching can be switched on for the whole search via caseInsensitive()
 * or on the individual Term objects returned by addTerm().
 */
class CollectionSearch
{
    /** @var Term[] every registered term, keyed by its original term string */
    protected array $allTerms = [];

    /** @var array<int, Term[]> the same Term objects bucketed by group (token length when split, 0 otherwise) */
    protected array $groupedTerms = [];

    /** @var AbstractCollection the collection that is being searched */
    protected AbstractCollection $collection;

    /** @var ?int cached result of the collection's getTokenIndexMaximum(), fetched for the first wildcard term */
    protected ?int $max = null;

    /** @var bool whether terms added from now on are made case-insensitive */
    protected bool $defaultCaseInsensitive = false;

    /**
     * Set up a search against the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Make every term added after this call match case-insensitively
     *
     * Terms that were registered before the call are not touched.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->defaultCaseInsensitive = true;

        return $this;
    }

    /**
     * Register a term for the next execute() run
     *
     * @param string $term the search term, may include * wildcards at start/end
     * @return Term the term object; its matches are only filled in once execute() has run
     */
    public function addTerm(string $term): Term
    {
        $entry = new Term($term);
        if ($this->defaultCaseInsensitive) {
            $entry->caseInsensitive();
        }

        // flat registry; a later term with the same original string replaces the earlier entry
        $this->allTerms[$entry->getOriginal()] = $entry;

        // non-split layout: there is only one token index, addressed as group 0
        if (!$this->collection->isSplitByLength()) {
            $this->groupedTerms[0][] = $entry;
            return $entry;
        }

        // split layout: a wildcard term has to be checked against every index from its own
        // length up to the longest indexed one, a plain term only against its own length
        if ($entry->getWildcard()) {
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $upper = $this->max;
        } else {
            $upper = $entry->getLength();
        }

        for ($length = $entry->getLength(); $length <= $upper; $length++) {
            $this->groupedTerms[$length][] = $entry;
        }

        return $entry;
    }

    /**
     * Run the search for all registered terms
     *
     * First pass: each group's token index is walked once and every token is tested against
     * all terms of that group; the matching token IDs are then handed to the collection's
     * resolveTokenFrequencies() to learn which entities carry them. Second pass: all entity
     * IDs seen are resolved to names in one retrieveRows() call on the entity index and the
     * matches are recorded on the Term objects via addMatch(entity name, token, frequency).
     *
     * @return Term[] All defined terms keyed by original term string
     */
    public function execute(): array
    {
        $entityIds = []; // [entityId => true], used as a set
        $perGroup = [];  // list of [tokenMatches, frequencies] pairs, one per group with hits

        // Pass 1: token scan and frequency lookup, group by group
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            // one walk over the index, testing each non-empty token against all terms of the group
            $hits = []; // [tokenId => [[term, token], ...]]
            foreach ($tokenIndex as $tokenId => $tokenValue) {
                if ($tokenValue === '') {
                    continue;
                }
                foreach ($terms as $candidate) {
                    if ($candidate->matches($tokenValue)) {
                        $hits[$tokenId][] = [$candidate, $tokenValue];
                    }
                }
            }
            if ($hits === []) {
                continue;
            }

            // find out which entities contain the matched tokens
            $frequencies = $this->collection->resolveTokenFrequencies($group, array_keys($hits));

            // remember every entity ID so the names can be fetched in one go later
            foreach ($frequencies as $byEntity) {
                $entityIds += array_fill_keys(array_keys($byEntity), true);
            }

            $perGroup[] = [$hits, $frequencies];
        }

        if ($entityIds === []) {
            return $this->allTerms;
        }

        // translate all collected entity IDs to names with a single call
        $names = $this->collection->getEntityIndex()->retrieveRows(array_keys($entityIds));

        // Pass 2: attach the resolved matches to their terms
        foreach ($perGroup as [$hits, $frequencies]) {
            foreach ($frequencies as $tokenId => $byEntity) {
                foreach ($hits[$tokenId] as [$matchedTerm, $tokenValue]) {
                    foreach ($byEntity as $entityId => $count) {
                        $name = $names[$entityId] ?? '';
                        // entities without a resolvable name are skipped
                        if ($name !== '') {
                            $matchedTerm->addMatch($name, $tokenValue, $count);
                        }
                    }
                }
            }
        }

        return $this->allTerms;
    }
}
156