xref: /dokuwiki/inc/Search/Query/QueryEvaluator.php (revision 06053dca2fac9a1da4eb1accf8c2488942da5d2a)
1<?php
2
3namespace dokuwiki\Search\Query;
4
5use dokuwiki\Utf8\PhpString;
6use dokuwiki\Extension\Event;
7use dokuwiki\Search\Collection\Term;
8use dokuwiki\Search\Indexer;
9use dokuwiki\Utf8;
10
11/**
12 * Evaluates a parsed search query against word lookup results
13 *
14 * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry)
15 * to avoid materializing the full page index unless absolutely necessary
16 * (standalone NOT or namespace-only queries).
17 */
18class QueryEvaluator
19{
20    /** @var string[] RPN token array from QueryParser */
21    protected array $rpn;
22
23    /** @var Term[] word => Term mapping from CollectionSearch */
24    protected array $terms;
25
26    /** @var PageSet|null lazy-loaded universe of all indexed pages */
27    protected ?PageSet $allPages = null;
28
29    /**
30     * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary']
31     * @param Term[] $terms word => Term mapping from CollectionSearch::execute()
32     */
33    public function __construct(array $rpn, array $terms)
34    {
35        $this->rpn = $rpn;
36        $this->terms = $terms;
37    }
38
39    /**
40     * Evaluate the RPN query and return matching pages with scores
41     *
42     * The query is represented in Reverse Polish Notation (RPN), also known as postfix
43     * notation. In RPN, operands come first and operators follow. For example, the infix
44     * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for
45     * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND".
46     *
47     * Evaluation uses a stack. Each token is processed left to right: operand tokens
48     * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND,
49     * OR, NOT) pop their operands from the stack, compute a result, and push it back.
50     * After all tokens are processed, the single remaining stack entry is the final result.
51     *
52     * The stack entries are typed: word lookups produce PageSet entries (concrete page
53     * results with scores), namespace tokens produce NamespacePredicate entries (a filter
54     * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators
55     * inspect the types of their operands to choose the most efficient operation — for
56     * example, AND with a NegatedEntry performs set subtraction rather than requiring
57     * the full page universe.
58     *
59     * @return array<string, int> page ID => score
60     */
61    public function evaluate(): array
62    {
63        /** @var StackEntry[] $stack */
64        $stack = [];
65
66        foreach ($this->rpn as $token) {
67            switch (substr($token, 0, 3)) {
68                case 'W+:':
69                case 'W-:':
70                case 'W_:':
71                    $word = substr($token, 3);
72                    if (isset($this->terms[$word])) {
73                        $stack[] = new PageSet($this->terms[$word]->getEntityFrequencies());
74                    }
75                    break;
76
77                case 'P+:':
78                case 'P-:':
79                    $phrase = substr($token, 3);
80                    // Phrases are always preceded by their component words AND'd together,
81                    // so the top of stack contains pages matching all words in the phrase.
82                    // We verify the actual phrase exists in those candidate pages.
83                    $candidates = end($stack) ?: new PageSet();
84                    $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates));
85                    break;
86
87                case 'N+:':
88                case 'N-:':
89                    $ns = cleanID(substr($token, 3)) . ':';
90                    $stack[] = new NamespacePredicate($ns);
91                    break;
92
93                case 'AND':
94                    $right = array_pop($stack);
95                    $left = array_pop($stack);
96                    $stack[] = $this->opAnd($left, $right);
97                    break;
98
99                case 'OR':
100                    $right = array_pop($stack);
101                    $left = array_pop($stack);
102                    $stack[] = $this->opOr($left, $right);
103                    break;
104
105                case 'NOT':
106                    $operand = array_pop($stack);
107                    $stack[] = new NegatedEntry($operand);
108                    break;
109            }
110        }
111
112        $result = array_pop($stack) ?? new PageSet();
113        return $this->materialize($result)->getPages();
114    }
115
116    // region Operators
117
118    /**
119     * AND: combine two operands based on their types
120     *
121     * PageSet AND PageSet produces an intersection with summed scores. When one
122     * operand is a NegatedEntry, the operation becomes set subtraction. When one
123     * operand is a NamespacePredicate, the other is filtered by namespace prefix.
124     *
125     * @param StackEntry $left
126     * @param StackEntry $right
127     * @return StackEntry
128     */
129    protected function opAnd(StackEntry $left, StackEntry $right): StackEntry
130    {
131        // page set AND negated → subtract
132        if ($left instanceof PageSet && $right instanceof NegatedEntry) {
133            return $this->subtractNegated($left, $right);
134        }
135        if ($left instanceof NegatedEntry && $right instanceof PageSet) {
136            return $this->subtractNegated($right, $left);
137        }
138
139        // page set AND namespace → filter by prefix
140        if ($left instanceof PageSet && $right instanceof NamespacePredicate) {
141            return $right->filter($left);
142        }
143        if ($left instanceof NamespacePredicate && $right instanceof PageSet) {
144            return $left->filter($right);
145        }
146
147        // page set AND page set → intersect, sum scores
148        if ($left instanceof PageSet && $right instanceof PageSet) {
149            return $left->intersect($right);
150        }
151
152        // rare cases (negated AND negated, namespace AND namespace, etc.)
153        return $this->materialize($left)->intersect($this->materialize($right));
154    }
155
156    /**
157     * OR: unite two operands
158     *
159     * PageSet OR PageSet produces a union with summed scores. Other combinations
160     * require materializing operands into concrete page sets first.
161     *
162     * @param StackEntry $left
163     * @param StackEntry $right
164     * @return StackEntry
165     */
166    protected function opOr(StackEntry $left, StackEntry $right): StackEntry
167    {
168        if ($left instanceof PageSet && $right instanceof PageSet) {
169            return $left->unite($right);
170        }
171
172        return $this->materialize($left)->unite($this->materialize($right));
173    }
174
175    /**
176     * Subtract a NegatedEntry from a PageSet
177     *
178     * The inner entry of the NegatedEntry determines the operation:
179     * - NegatedEntry(PageSet): set subtraction
180     * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace
181     *
182     * @param PageSet $pages the positive page set
183     * @param NegatedEntry $negated the negated operand
184     * @return PageSet
185     */
186    protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet
187    {
188        $inner = $negated->getInner();
189
190        if ($inner instanceof NamespacePredicate) {
191            return $inner->exclude($pages);
192        }
193
194        return $pages->subtract($this->materialize($inner));
195    }
196
197    // endregion
198
199    // region Phrase matching
200
201    /**
202     * Check which pages from the candidate set contain the given phrase
203     *
204     * Verifies phrase presence by reading each page's raw wiki text.
205     * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event.
206     * Pages that match retain their original scores from the candidate set.
207     *
208     * @param string $phrase the phrase to search for
209     * @param PageSet $candidates pages to check (typically the AND'd word results)
210     * @return PageSet pages where the phrase was found, with original scores preserved
211     */
212    protected function matchPhrase(string $phrase, PageSet $candidates): PageSet
213    {
214        $matched = [];
215        foreach ($candidates->getPages() as $id => $score) {
216            $evdata = [
217                'id' => $id,
218                'phrase' => $phrase,
219                'text' => rawWiki($id),
220            ];
221            $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
222            if ($event->advise_before() && $event->result !== true) {
223                $text = PhpString::strtolower($evdata['text']);
224                if (str_contains($text, $phrase)) {
225                    $event->result = true;
226                }
227            }
228            $event->advise_after();
229            if ($event->result === true) {
230                $matched[$id] = $score;
231            }
232        }
233        return new PageSet($matched);
234    }
235
236    // endregion
237
238    // region Materialization
239
240    /**
241     * Convert any StackEntry into a concrete PageSet
242     *
243     * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry
244     * cannot be resolved without knowing all existing pages — a namespace
245     * predicate needs the full page list to find matching pages, and a negated
246     * entry needs it to compute the complement. This is why materialization
247     * triggers a lazy-load of the full page index from disk.
248     *
249     * This is only needed for standalone namespace or negative-only queries
250     * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed
251     * operators handle these entries without the universe.
252     *
253     * @param StackEntry $entry
254     * @return PageSet
255     */
256    protected function materialize(StackEntry $entry): PageSet
257    {
258        if ($entry instanceof PageSet) {
259            return $entry;
260        }
261
262        if ($entry instanceof NegatedEntry) {
263            return $this->getAllPages()->subtract($this->materialize($entry->getInner()));
264        }
265
266        if ($entry instanceof NamespacePredicate) {
267            return $entry->filter($this->getAllPages());
268        }
269
270        return new PageSet();
271    }
272
273    /**
274     * Lazy-load the universe of all indexed pages
275     *
276     * @return PageSet all pages with score 0
277     */
278    protected function getAllPages(): PageSet
279    {
280        if (!$this->allPages instanceof PageSet) {
281            $pages = (new Indexer())->getAllPages();
282            $this->allPages = new PageSet(array_fill_keys($pages, 0));
283        }
284        return $this->allPages;
285    }
286
287    // endregion
288}
289