xref: /dokuwiki/inc/Search/Collection/CollectionSearch.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\SearchException;
6use dokuwiki\Search\Index\AbstractIndex;
7
/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Provides two APIs:
 * - addTerm()/execute(): For fulltext-style search with Term objects and min-length validation
 * - lookup(): For metadata-style search with exact/wildcard/callback matching, no length restrictions
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var array<int, string> a list of entities that match [entityID => entityName] */
    protected array $entities = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, null until first needed */
    protected ?int $max = null;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding the same term more than once returns the Term that was registered first.
     * Registering a second object would leave the first one in the group lists but
     * out of $allTerms, so it would be searched again yet never get its entities resolved.
     *
     * @param string $term
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     * @throws SearchException if the given term was too short or otherwise invalid
     */
    public function addTerm(string $term): Term
    {
        $termObject = new Term($term);
        $key = $termObject->getOriginal();

        // reuse an already registered term instead of tracking it twice
        if (isset($this->allTerms[$key])) {
            return $this->allTerms[$key];
        }

        // we keep all terms in an array
        $this->allTerms[$key] = $termObject;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $termObject;
            return $termObject;
        }

        $length = $termObject->getLength();
        if ($termObject->getWildcard()) {
            // for wildcards, we need to find tokens from all indexes equal or larger than the term length
            // the maximum is fetched once and remembered for later terms
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $upperBound = $this->max;
        } else {
            // exact terms only live in the index of their own length
            $upperBound = $length;
        }

        for ($group = $length; $group <= $upperBound; $group++) {
            $this->groupedTerms[$group][] = $termObject;
        }

        return $termObject;
    }

    /**
     * Execute the search
     *
     * Runs the three lookup stages in order: tokens, frequencies, entity names.
     *
     * NOTE(review): every call feeds tokens and frequencies into the Term objects again;
     * whether Term tolerates repeated execution is not visible here - confirm before calling twice.
     *
     * @return Term[] All defined terms. Use their methods to access the results
     */
    public function execute(): array
    {
        $this->findTokens();
        $this->findFrequencies();
        $this->findEntities();

        return $this->allTerms;
    }

    /**
     * Get the entities that have the term
     *
     * Only filled by execute(); lookup() does not touch this list.
     *
     * @return array<int, string> [entityID => entityName, ...]
     */
    public function getEntities(): array
    {
        return $this->entities;
    }

    /**
     * Search for values in the collection's token index
     *
     * A simpler API for metadata-style lookups without Term objects or min-length restrictions.
     * Supports exact match, wildcard (*), and callback matching.
     *
     * Every given value is present as a key in the result, values without a match map to an empty list.
     *
     * @param string|string[] $values search values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [value => [entityName, ...], ...]
     */
    public function lookup(string|array $values, ?callable $func = null): array
    {
        $values = (array)$values;
        $result = array_fill_keys($values, []);

        // nothing to search for, no need to touch any index
        if ($values === []) {
            return $result;
        }

        // determine which groups to search
        if ($this->collection->isSplitByLength()) {
            $max = $this->collection->getTokenIndexMaximum();
            // range() counts downwards when the end is below the start, so guard against range(1, 0)
            $groups = $max >= 1 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        // find matching token IDs across all groups
        // [group => ['matches' => [tokenId => [value, ...]], 'freqs' => [tokenId => [entityId => freq]]], ...]
        $allMatches = [];
        $allEntityIds = []; // [entityId => true] used as a set

        foreach ($groups as $group) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            $matches = $this->findMatchingTokens($tokenIndex, $values, $func);
            if (empty($matches)) {
                continue;
            }

            // resolve token IDs to entity frequencies
            $tokenFreqs = $this->collection->resolveTokenFrequencies($group, array_keys($matches));
            foreach ($tokenFreqs as $frequencies) {
                foreach (array_keys($frequencies) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $allMatches[$group] = ['matches' => $matches, 'freqs' => $tokenFreqs];
        }

        if (empty($allEntityIds)) {
            return $result;
        }

        // resolve entity IDs to names
        $entityNames = $this->collection->getEntityIndex()->retrieveRows(array_keys($allEntityIds));

        // assemble results
        foreach ($allMatches as $data) {
            foreach ($data['matches'] as $tokenId => $valList) {
                $pages = [];
                foreach (array_keys($data['freqs'][$tokenId] ?? []) as $entityId) {
                    // skip entities that have no (or an empty) name in the entity index
                    $name = $entityNames[$entityId] ?? '';
                    if ($name !== '') {
                        $pages[] = $name;
                    }
                }

                // a single token may have been matched by several of the search values
                foreach ($valList as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        return $result;
    }

    /**
     * Find token IDs matching the given values using exact, wildcard, or callback matching
     *
     * With a callback every non-empty index word is tested against every value. Without one,
     * a value starting and/or ending in "*" is turned into a regular expression that is left
     * unanchored on the wildcard side(s); any other value is looked up as exact row.
     *
     * @param AbstractIndex $tokenIndex
     * @param string[] $values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [tokenId => [value, ...], ...] matching token IDs with the values they matched
     */
    protected function findMatchingTokens(AbstractIndex $tokenIndex, array $values, ?callable $func): array
    {
        $matches = [];

        if ($func !== null) {
            // callback matching: iterate all tokens
            foreach ($tokenIndex as $tokenId => $word) {
                if ($word === '') {
                    continue;
                }
                foreach ($values as $val) {
                    if ($func($val, $word)) {
                        $matches[$tokenId][] = $val;
                    }
                }
            }

            return $matches;
        }

        foreach ($values as $val) {
            $needle = $val;

            // a leading asterisk removes the start anchor
            $openStart = str_starts_with($needle, '*');
            if ($openStart) {
                $needle = substr($needle, 1);
            }

            // a trailing asterisk (checked after stripping the leading one) removes the end anchor
            $openEnd = str_ends_with($needle, '*');
            if ($openEnd) {
                $needle = substr($needle, 0, -1);
            }

            if (!$openStart && !$openEnd) {
                // exact matching
                $tokenId = $tokenIndex->getRowID($val);
                if ($tokenId !== null) {
                    $matches[$tokenId][] = $val;
                }
                continue;
            }

            // wildcard matching
            $re = '/' . ($openStart ? '' : '^') . preg_quote($needle, '/') . ($openEnd ? '' : '$') . '/';
            foreach ($tokenIndex->search($re) as $tokenId => $word) {
                $matches[$tokenId][] = $val;
            }
        }

        return $matches;
    }

    /**
     * Look up the matching tokens for all set terms
     *
     * Groups whose token index does not exist are skipped.
     *
     * @return void
     */
    protected function findTokens(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) {
                continue;
            }

            foreach ($terms as $term) {
                // the term supplies its own quoted pattern, it is anchored on both sides here
                $term->addTokens($group, $tokenIndex->search('/^' . $term->getQuoted() . '$/'));
            }
        }
    }

    /**
     * Look up the entity frequencies for all tokens found by findTokens
     *
     * Each entity seen is also recorded in $this->entities with an empty placeholder
     * name, findEntities() replaces these with the real names afterwards.
     *
     * @return void
     */
    protected function findFrequencies(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            foreach ($terms as $term) {
                $tokenIds = $term->getTokenIDsByGroup($group);
                if (empty($tokenIds)) {
                    continue;
                }

                $tokenFreqs = $this->collection->resolveTokenFrequencies($group, $tokenIds);
                foreach ($tokenFreqs as $frequencies) {
                    foreach ($frequencies as $entityID => $frequency) {
                        $term->addEntityFrequency($entityID, $frequency);
                        $this->entities[$entityID] = '';
                    }
                }
            }
        }
    }

    /**
     * Lookup the actual names of found entities
     *
     * Replaces the placeholder list built by findFrequencies() with the rows retrieved
     * from the entity index and passes the resulting list on to every term.
     *
     * @return void
     */
    protected function findEntities(): void
    {
        $entityIndex = $this->collection->getEntityIndex();
        $this->entities = $entityIndex->retrieveRows(array_keys($this->entities));

        foreach ($this->allTerms as $term) {
            $term->resolveEntities($this->entities);
        }
    }
}
284