xref: /dokuwiki/inc/Search/Collection/CollectionSearch.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1*6734bb8cSAndreas Gohr<?php
2*6734bb8cSAndreas Gohr
3*6734bb8cSAndreas Gohrnamespace dokuwiki\Search\Collection;
4*6734bb8cSAndreas Gohr
5*6734bb8cSAndreas Gohruse dokuwiki\Search\Exception\SearchException;
6*6734bb8cSAndreas Gohruse dokuwiki\Search\Index\AbstractIndex;
7*6734bb8cSAndreas Gohr
/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Provides two APIs:
 * - addTerm()/execute(): For fulltext-style search with Term objects and min-length validation
 * - lookup(): For metadata-style search with exact/wildcard/callback matching, no length restrictions
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var array<int, string> a list of entities that match [entityID => entityName] */
    protected array $entities = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed (lazily loaded by addTerm()) */
    protected ?int $max = null;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding the same term more than once returns the Term instance that was registered first,
     * so every caller holds the object that execute() actually fills with results.
     *
     * @param string $term
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     * @throws SearchException if the given term was too short or otherwise invalid
     */
    public function addTerm(string $term): Term
    {
        $termObj = new Term($term);
        $original = $termObj->getOriginal();

        // A repeated term must not be registered twice: the earlier instance would stay in the
        // groups (and be searched again) but would never get its entities resolved, because
        // entity resolution only walks $allTerms.
        if (isset($this->allTerms[$original])) {
            return $this->allTerms[$original];
        }
        $this->allTerms[$original] = $termObj;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $termObj;
            return $termObj;
        }

        // split by length: an exact term only lives in the index of its own length, a wildcard
        // term can match tokens in every index of equal or greater length
        $length = $termObj->getLength();
        $upper = $length;
        if ($termObj->getWildcard()) {
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $upper = $this->max;
        }

        for ($group = $length; $group <= $upper; $group++) {
            $this->groupedTerms[$group][] = $termObj;
        }

        return $termObj;
    }

    /**
     * Execute the search
     *
     * Runs the token lookup, the frequency lookup and the entity name resolution in that order.
     *
     * NOTE(review): every call re-runs all three steps on the same Term objects. Whether Term
     * accumulates or replaces results on a second run is not visible here - confirm before
     * calling this more than once per instance.
     *
     * @return Term[] All defined terms. Use their methods to access the results
     */
    public function execute(): array
    {
        $this->findTokens();
        $this->findFrequencies();
        $this->findEntities();

        return $this->allTerms;
    }

    /**
     * Get the entities that have the term
     *
     * Only populated by execute(); lookup() does not touch this list.
     *
     * @return array<int, string> [entityID => entityName, ...]
     */
    public function getEntities(): array
    {
        return $this->entities;
    }

    /**
     * Search for values in the collection's token index
     *
     * A simpler API for metadata-style lookups without Term objects or min-length restrictions.
     * Supports exact match, wildcard (*), and callback matching.
     *
     * Every distinct search value gets a key in the result, even when nothing matched. The entity
     * lists are not de-duplicated: when several matching tokens reference the same entity (e.g. a
     * wildcard matching two tokens of one page) that entity is listed once per token.
     *
     * @param string|string[] $values search values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [value => [entityName, ...], ...]
     */
    public function lookup(string|array $values, ?callable $func = null): array
    {
        // Normalise to a list of distinct values. A value passed twice would otherwise be matched
        // twice and have its entities merged into the result once per repetition.
        $values = array_values(array_unique((array) $values));

        $result = array_fill_keys($values, []);
        if ($values === []) {
            return $result;
        }

        // determine which groups to search
        if ($this->collection->isSplitByLength()) {
            // range(1, 0) counts downwards and yields [1, 0], so an empty index needs a guard
            $maxGroup = (int) $this->collection->getTokenIndexMaximum();
            $groups = $maxGroup >= 1 ? range(1, $maxGroup) : [];
        } else {
            $groups = [0];
        }

        // [group => ['matches' => [tokenId => [value, ...]], 'freqs' => [tokenId => [entityId => freq]]]]
        $perGroup = [];
        // [entityId => true] set of all entities referenced by any matching token
        $entityIds = [];

        foreach ($groups as $group) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            $matches = $this->findMatchingTokens($tokenIndex, $values, $func);
            if ($matches === []) {
                continue;
            }

            // resolve token IDs to entity frequencies
            $tokenFreqs = $this->collection->resolveTokenFrequencies($group, array_keys($matches));
            foreach ($tokenFreqs as $frequencies) {
                foreach (array_keys($frequencies) as $entityId) {
                    $entityIds[$entityId] = true;
                }
            }

            $perGroup[$group] = ['matches' => $matches, 'freqs' => $tokenFreqs];
        }

        if ($entityIds === []) {
            return $result;
        }

        // resolve all entity IDs to names in a single index access
        $entityNames = $this->collection->getEntityIndex()->retrieveRows(array_keys($entityIds));

        // assemble results
        foreach ($perGroup as $data) {
            foreach ($data['matches'] as $tokenId => $matchedValues) {
                if (!isset($data['freqs'][$tokenId])) {
                    continue;
                }

                $pages = [];
                foreach (array_keys($data['freqs'][$tokenId]) as $entityId) {
                    // skip entities without a (non-empty) name in the entity index
                    if (isset($entityNames[$entityId]) && $entityNames[$entityId] !== '') {
                        $pages[] = $entityNames[$entityId];
                    }
                }
                if ($pages === []) {
                    continue;
                }

                foreach ($matchedValues as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        return $result;
    }

    /**
     * Find token IDs matching the given values using exact, wildcard, or callback matching
     *
     * With a callback, every non-empty token of the index is passed to it together with each
     * value. Without a callback, a leading and/or trailing "*" in a value turns it into an
     * unanchored (on that side) regular expression search; values without "*" at either end
     * are looked up as exact rows.
     *
     * @param AbstractIndex $tokenIndex
     * @param string[] $values
     * @param callable|null $func
     * @return array [tokenId => [value, ...], ...] matching token IDs with the values they matched
     */
    protected function findMatchingTokens(AbstractIndex $tokenIndex, array $values, ?callable $func): array
    {
        $matches = [];

        if ($func !== null) {
            // callback matching: iterate all tokens
            foreach ($tokenIndex as $tokenId => $word) {
                if ($word === '') {
                    continue;
                }
                foreach ($values as $val) {
                    if ($func($val, $word)) {
                        $matches[$tokenId][] = $val;
                    }
                }
            }

            return $matches;
        }

        foreach ($values as $val) {
            $needle = $val;

            // a leading "*" drops the start anchor, a trailing "*" drops the end anchor
            $anchorStart = !str_starts_with($needle, '*');
            if (!$anchorStart) {
                $needle = substr($needle, 1);
            }
            $anchorEnd = !str_ends_with($needle, '*');
            if (!$anchorEnd) {
                $needle = substr($needle, 0, -1);
            }

            if ($anchorStart && $anchorEnd) {
                // exact matching
                $tokenId = $tokenIndex->getRowID($val);
                if ($tokenId !== null) {
                    $matches[$tokenId][] = $val;
                }
                continue;
            }

            // wildcard matching
            $re = '/' . ($anchorStart ? '^' : '') . preg_quote($needle, '/') . ($anchorEnd ? '$' : '') . '/';
            foreach (array_keys($tokenIndex->search($re)) as $tokenId) {
                $matches[$tokenId][] = $val;
            }
        }

        return $matches;
    }

    /**
     * Look up the matching tokens for all set terms
     *
     * Each group's token index is searched once per term assigned to that group; groups whose
     * index does not exist are skipped.
     *
     * @return void
     */
    protected function findTokens(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            foreach ($terms as $term) {
                $pattern = '/^' . $term->getQuoted() . '$/';
                $term->addTokens($group, $tokenIndex->search($pattern));
            }
        }
    }

    /**
     * Look up the entity frequencies for all tokens found by findTokens
     *
     * Records each frequency on the term and registers the entity ID in $entities with an empty
     * name as placeholder; the names are filled in by findEntities().
     *
     * @return void
     */
    protected function findFrequencies(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            foreach ($terms as $term) {
                $tokenIds = $term->getTokenIDsByGroup($group);
                if (empty($tokenIds)) {
                    continue;
                }

                $tokenFreqs = $this->collection->resolveTokenFrequencies($group, $tokenIds);
                foreach ($tokenFreqs as $frequencies) {
                    foreach ($frequencies as $entityID => $frequency) {
                        $term->addEntityFrequency($entityID, $frequency);
                        $this->entities[$entityID] = '';
                    }
                }
            }
        }
    }

    /**
     * Lookup the actual names of found entities
     *
     * Replaces the placeholder list in $entities with the rows retrieved from the entity index
     * and hands that list to every registered term.
     *
     * @return void
     */
    protected function findEntities(): void
    {
        $entityIds = array_keys($this->entities);
        $this->entities = $this->collection->getEntityIndex()->retrieveRows($entityIds);

        foreach ($this->allTerms as $term) {
            $term->resolveEntities($this->entities);
        }
    }
}
284