xref: /dokuwiki/inc/Search/Collection/CollectionSearch.php (revision 5e9d26e3624fd22ca3c57447be9e16d0b502761e)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\SearchException;
6use dokuwiki\Search\Index\AbstractIndex;
7
/**
 * Search a collection for one or more terms with wildcards
 *
 * Works with any AbstractCollection (Frequency, Lookup, Direct) and handles both
 * split-by-length and non-split index layouts transparently.
 *
 * Provides two APIs:
 * - addTerm()/execute(): For fulltext-style search with Term objects and min-length validation
 * - lookup(): For metadata-style search with exact/wildcard/callback matching, no length restrictions
 */
class CollectionSearch
{
    /** @var Term[] all terms indexed by original term name */
    protected array $allTerms = [];

    /** @var array<int, Term[]> references to terms indexed by group (length for split, 0 for non-split) */
    protected array $groupedTerms = [];

    /** @var array<int, string> a list of entities that match [entityID => entityName] */
    protected array $entities = [];

    /** @var AbstractCollection The collection this search works on */
    protected AbstractCollection $collection;

    /** @var ?int the maximum token index suffix as currently indexed, fetched lazily by addTerm() */
    protected ?int $max = null;

    /**
     * Initialize a search on the given collection
     *
     * @param AbstractCollection $collection
     */
    public function __construct(AbstractCollection $collection)
    {
        $this->collection = $collection;
    }

    /**
     * Add a term that will be looked up in the index later
     *
     * Adding the same term more than once returns the Term object that was registered first,
     * so every Term handed out by this method is also the one that gets filled by execute().
     *
     * @param string $term
     * @return Term the internal representation of the term, it will not be complete before the search has been executed
     * @throws SearchException if the given term was too short or otherwise invalid
     */
    public function addTerm(string $term): Term
    {
        $parsed = new Term($term);
        $key = $parsed->getOriginal();

        // A repeated term must not be registered twice: the second object would replace the first one
        // in $allTerms while both stay in $groupedTerms, leaving the first one without resolved entities.
        if (isset($this->allTerms[$key])) {
            return $this->allTerms[$key];
        }

        // we keep all terms in an array
        $this->allTerms[$key] = $parsed;

        if (!$this->collection->isSplitByLength()) {
            // non-split: all terms go into a single group
            $this->groupedTerms[0][] = $parsed;
            return $parsed;
        }

        $length = $parsed->getLength();
        if ($parsed->getWildcard()) {
            // for wildcards, we need to find tokens from all indexes equal or larger than the term length
            $this->max ??= $this->collection->getTokenIndexMaximum();
            $upper = $this->max;
        } else {
            // without wildcards only the index of exactly the term's length can match
            $upper = $length;
        }

        for ($group = $length; $group <= $upper; $group++) {
            $this->groupedTerms[$group][] = $parsed;
        }

        return $parsed;
    }

    /**
     * Execute the search
     *
     * Runs the three lookup stages in order: tokens, then frequencies, then entity names.
     *
     * @return Term[] All defined terms. Use their methods to access the results
     */
    public function execute(): array
    {
        $this->findTokens();
        $this->findFrequencies();
        $this->findEntities();

        return $this->allTerms;
    }

    /**
     * Get the entities that have the term
     *
     * Only filled after execute() has run.
     *
     * @return array<int, string> [entityID => entityName, ...]
     */
    public function getEntities(): array
    {
        return $this->entities;
    }

    /**
     * Search for values in the collection's token index
     *
     * A simpler API for metadata-style lookups without Term objects or min-length restrictions.
     * Supports exact match, wildcard (*), and callback matching.
     *
     * Every requested value is present as a key in the result, with an empty list when nothing matched.
     * When a value matches several tokens (wildcard or callback), the entity lists of all these tokens
     * are concatenated, so an entity name may appear more than once.
     *
     * @param string|string[] $values search values
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array [value => [entityName, ...], ...]
     */
    public function lookup(string|array $values, ?callable $func = null): array
    {
        // Normalise to an array and drop repeated values. Without this a value passed twice would be
        // recorded twice per matching token and its entity list would be merged in twice.
        $values = array_unique((array) $values);

        $result = array_fill_keys($values, []);

        // determine which groups to search
        if ($this->collection->isSplitByLength()) {
            $max = $this->collection->getTokenIndexMaximum();
            $groups = $max > 0 ? range(1, $max) : [];
        } else {
            $groups = [0];
        }

        // find matching token IDs across all groups
        // [group => ['matches' => [tokenId => [value, ...]], 'freqs' => [tokenId => [entityId => frequency]]]]
        $allMatches = [];
        $allEntityIds = []; // used as a set: [entityId => true]

        foreach ($groups as $group) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) continue;

            $matches = $this->findMatchingTokens($tokenIndex, $values, $func);
            if ($matches === []) continue;

            // resolve token IDs to entity frequencies
            $tokenFreqs = $this->collection->resolveTokenFrequencies($group, array_keys($matches));
            foreach ($tokenFreqs as $frequencies) {
                foreach (array_keys($frequencies) as $entityId) {
                    $allEntityIds[$entityId] = true;
                }
            }

            $allMatches[$group] = ['matches' => $matches, 'freqs' => $tokenFreqs];
        }

        if ($allEntityIds === []) return $result;

        // resolve entity IDs to names
        $entityIndex = $this->collection->getEntityIndex();
        $entityNames = $entityIndex->retrieveRows(array_keys($allEntityIds));

        // assemble results
        foreach ($allMatches as $data) {
            foreach ($data['matches'] as $tokenId => $valList) {
                $pages = [];
                foreach (array_keys($data['freqs'][$tokenId] ?? []) as $entityId) {
                    // entities without a (non-empty) name are skipped
                    $name = $entityNames[$entityId] ?? '';
                    if ($name !== '') {
                        $pages[] = $name;
                    }
                }
                foreach ($valList as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        return $result;
    }

    /**
     * Find token IDs matching the given values using exact, wildcard, or callback matching
     *
     * With a callback, every non-empty word of the index is passed to it together with each value.
     * Without a callback, a leading and/or trailing "*" in a value turns the respective side into
     * an open match; values without such a wildcard are looked up exactly.
     *
     * @param AbstractIndex $tokenIndex
     * @param string[] $values
     * @param callable|null $func
     * @return array [tokenId => [value, ...], ...] matching token IDs with the values they matched
     */
    protected function findMatchingTokens(AbstractIndex $tokenIndex, array $values, ?callable $func): array
    {
        $matches = [];

        if ($func !== null) {
            // callback matching: iterate all tokens
            foreach ($tokenIndex as $tokenId => $word) {
                if ($word === '') continue;
                foreach ($values as $val) {
                    if ($func($val, $word)) {
                        $matches[$tokenId][] = $val;
                    }
                }
            }

            return $matches;
        }

        foreach ($values as $val) {
            $needle = $val;

            // strip the leading wildcard first, the trailing check then works on the remainder
            $openStart = str_starts_with($needle, '*');
            if ($openStart) {
                $needle = substr($needle, 1);
            }
            $openEnd = str_ends_with($needle, '*');
            if ($openEnd) {
                $needle = substr($needle, 0, -1);
            }

            if (!$openStart && !$openEnd) {
                // exact matching
                $tokenId = $tokenIndex->getRowID($val);
                if ($tokenId !== null) {
                    $matches[$tokenId][] = $val;
                }
                continue;
            }

            // wildcard matching: anchor only the side that has no wildcard
            $re = '/' . ($openStart ? '' : '^') . preg_quote($needle, '/') . ($openEnd ? '' : '$') . '/';
            foreach ($tokenIndex->search($re) as $tokenId => $word) {
                $matches[$tokenId][] = $val;
            }
        }

        return $matches;
    }

    /**
     * Look up the matching tokens for all set terms
     *
     * Each group's token index is searched with the anchored pattern of every term registered
     * for that group; groups whose index does not exist are skipped.
     *
     * @return void
     */
    protected function findTokens(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            $tokenIndex = $this->collection->getTokenIndex($group);
            if (!$tokenIndex->exists()) continue;
            foreach ($terms as $term) {
                $term->addTokens($group, $tokenIndex->search('/^' . $term->getQuoted() . '$/'));
            }
        }
    }

    /**
     * Look up the entity frequencies for all tokens found by findTokens
     *
     * Every entity seen is also recorded in $this->entities with an empty name as placeholder;
     * findEntities() fills in the names afterwards.
     *
     * @return void
     */
    protected function findFrequencies(): void
    {
        foreach ($this->groupedTerms as $group => $terms) {
            foreach ($terms as $term) {
                $tokenIds = $term->getTokenIDsByGroup($group);
                if (empty($tokenIds)) continue;

                $tokenFreqs = $this->collection->resolveTokenFrequencies($group, $tokenIds);
                foreach ($tokenFreqs as $frequencies) {
                    foreach ($frequencies as $entityID => $frequency) {
                        $term->addEntityFrequency($entityID, $frequency);
                        $this->entities[$entityID] = '';
                    }
                }
            }
        }
    }

    /**
     * Lookup the actual names of found entities
     *
     * Replaces the placeholder list collected by findFrequencies() with the rows retrieved from
     * the entity index and hands the result to every term.
     *
     * @return void
     */
    protected function findEntities(): void
    {
        $entityIndex = $this->collection->getEntityIndex();
        $this->entities = $entityIndex->retrieveRows(array_keys($this->entities));

        foreach ($this->allTerms as $term) {
            $term->resolveEntities($this->entities);
        }
    }
}
285