xref: /dokuwiki/inc/Search/Query/QueryEvaluator.php (revision ede4646658cf51245060332d97a319a39c788ea1)
1<?php
2
3namespace dokuwiki\Search\Query;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\Term;
7use dokuwiki\Search\Indexer;
8use dokuwiki\Utf8;
9
10/**
11 * Evaluates a parsed search query against word lookup results
12 *
13 * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry)
14 * to avoid materializing the full page index unless absolutely necessary
15 * (standalone NOT or namespace-only queries).
16 */
17class QueryEvaluator
18{
19    /** @var string[] RPN token array from QueryParser */
20    protected array $rpn;
21
22    /** @var Term[] word => Term mapping from FrequencyCollectionSearch */
23    protected array $terms;
24
25    /** @var PageSet|null lazy-loaded universe of all indexed pages */
26    protected ?PageSet $allPages = null;
27
28    /**
29     * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary']
30     * @param Term[] $terms word => Term mapping from FrequencyCollectionSearch::execute()
31     */
32    public function __construct(array $rpn, array $terms)
33    {
34        $this->rpn = $rpn;
35        $this->terms = $terms;
36    }
37
38    /**
39     * Evaluate the RPN query and return matching pages with scores
40     *
41     * The query is represented in Reverse Polish Notation (RPN), also known as postfix
42     * notation. In RPN, operands come first and operators follow. For example, the infix
43     * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for
44     * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND".
45     *
46     * Evaluation uses a stack. Each token is processed left to right: operand tokens
47     * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND,
48     * OR, NOT) pop their operands from the stack, compute a result, and push it back.
49     * After all tokens are processed, the single remaining stack entry is the final result.
50     *
51     * The stack entries are typed: word lookups produce PageSet entries (concrete page
52     * results with scores), namespace tokens produce NamespacePredicate entries (a filter
53     * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators
54     * inspect the types of their operands to choose the most efficient operation — for
55     * example, AND with a NegatedEntry performs set subtraction rather than requiring
56     * the full page universe.
57     *
58     * @return array<string, int> page ID => score
59     */
60    public function evaluate(): array
61    {
62        /** @var StackEntry[] $stack */
63        $stack = [];
64
65        foreach ($this->rpn as $token) {
66            switch (substr($token, 0, 3)) {
67                case 'W+:':
68                case 'W-:':
69                case 'W_:':
70                    $word = substr($token, 3);
71                    if (isset($this->terms[$word])) {
72                        $stack[] = new PageSet((array)$this->terms[$word]->getEntityFrequencies());
73                    }
74                    break;
75
76                case 'P+:':
77                case 'P-:':
78                    $phrase = substr($token, 3);
79                    // Phrases are always preceded by their component words AND'd together,
80                    // so the top of stack contains pages matching all words in the phrase.
81                    // We verify the actual phrase exists in those candidate pages.
82                    $candidates = end($stack) ?: new PageSet();
83                    $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates));
84                    break;
85
86                case 'N+:':
87                case 'N-:':
88                    $ns = cleanID(substr($token, 3)) . ':';
89                    $stack[] = new NamespacePredicate($ns);
90                    break;
91
92                case 'AND':
93                    $right = array_pop($stack);
94                    $left = array_pop($stack);
95                    $stack[] = $this->opAnd($left, $right);
96                    break;
97
98                case 'OR':
99                    $right = array_pop($stack);
100                    $left = array_pop($stack);
101                    $stack[] = $this->opOr($left, $right);
102                    break;
103
104                case 'NOT':
105                    $operand = array_pop($stack);
106                    $stack[] = new NegatedEntry($operand);
107                    break;
108            }
109        }
110
111        $result = array_pop($stack) ?? new PageSet();
112        return $this->materialize($result)->getPages();
113    }
114
115    // region Operators
116
117    /**
118     * AND: combine two operands based on their types
119     *
120     * PageSet AND PageSet produces an intersection with summed scores. When one
121     * operand is a NegatedEntry, the operation becomes set subtraction. When one
122     * operand is a NamespacePredicate, the other is filtered by namespace prefix.
123     *
124     * @param StackEntry $left
125     * @param StackEntry $right
126     * @return StackEntry
127     */
128    protected function opAnd(StackEntry $left, StackEntry $right): StackEntry
129    {
130        // page set AND negated → subtract
131        if ($left instanceof PageSet && $right instanceof NegatedEntry) {
132            return $this->subtractNegated($left, $right);
133        }
134        if ($left instanceof NegatedEntry && $right instanceof PageSet) {
135            return $this->subtractNegated($right, $left);
136        }
137
138        // page set AND namespace → filter by prefix
139        if ($left instanceof PageSet && $right instanceof NamespacePredicate) {
140            return $right->filter($left);
141        }
142        if ($left instanceof NamespacePredicate && $right instanceof PageSet) {
143            return $left->filter($right);
144        }
145
146        // page set AND page set → intersect, sum scores
147        if ($left instanceof PageSet && $right instanceof PageSet) {
148            return $left->intersect($right);
149        }
150
151        // rare cases (negated AND negated, namespace AND namespace, etc.)
152        return $this->materialize($left)->intersect($this->materialize($right));
153    }
154
155    /**
156     * OR: unite two operands
157     *
158     * PageSet OR PageSet produces a union with summed scores. Other combinations
159     * require materializing operands into concrete page sets first.
160     *
161     * @param StackEntry $left
162     * @param StackEntry $right
163     * @return StackEntry
164     */
165    protected function opOr(StackEntry $left, StackEntry $right): StackEntry
166    {
167        if ($left instanceof PageSet && $right instanceof PageSet) {
168            return $left->unite($right);
169        }
170
171        return $this->materialize($left)->unite($this->materialize($right));
172    }
173
174    /**
175     * Subtract a NegatedEntry from a PageSet
176     *
177     * The inner entry of the NegatedEntry determines the operation:
178     * - NegatedEntry(PageSet): set subtraction
179     * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace
180     *
181     * @param PageSet $pages the positive page set
182     * @param NegatedEntry $negated the negated operand
183     * @return PageSet
184     */
185    protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet
186    {
187        $inner = $negated->getInner();
188
189        if ($inner instanceof NamespacePredicate) {
190            return $inner->exclude($pages);
191        }
192
193        return $pages->subtract($this->materialize($inner));
194    }
195
196    // endregion
197
198    // region Phrase matching
199
200    /**
201     * Check which pages from the candidate set contain the given phrase
202     *
203     * Verifies phrase presence by reading each page's raw wiki text.
204     * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event.
205     * Pages that match retain their original scores from the candidate set.
206     *
207     * @param string $phrase the phrase to search for
208     * @param PageSet $candidates pages to check (typically the AND'd word results)
209     * @return PageSet pages where the phrase was found, with original scores preserved
210     */
211    protected function matchPhrase(string $phrase, PageSet $candidates): PageSet
212    {
213        $matched = [];
214        foreach ($candidates->getPages() as $id => $score) {
215            $evdata = [
216                'id' => $id,
217                'phrase' => $phrase,
218                'text' => rawWiki($id),
219            ];
220            $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
221            if ($event->advise_before() && $event->result !== true) {
222                $text = Utf8\PhpString::strtolower($evdata['text']);
223                if (str_contains($text, $phrase)) {
224                    $event->result = true;
225                }
226            }
227            $event->advise_after();
228            if ($event->result === true) {
229                $matched[$id] = $score;
230            }
231        }
232        return new PageSet($matched);
233    }
234
235    // endregion
236
237    // region Materialization
238
239    /**
240     * Convert any StackEntry into a concrete PageSet
241     *
242     * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry
243     * cannot be resolved without knowing all existing pages — a namespace
244     * predicate needs the full page list to find matching pages, and a negated
245     * entry needs it to compute the complement. This is why materialization
246     * triggers a lazy-load of the full page index from disk.
247     *
248     * This is only needed for standalone namespace or negative-only queries
249     * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed
250     * operators handle these entries without the universe.
251     *
252     * @param StackEntry $entry
253     * @return PageSet
254     */
255    protected function materialize(StackEntry $entry): PageSet
256    {
257        if ($entry instanceof PageSet) {
258            return $entry;
259        }
260
261        if ($entry instanceof NegatedEntry) {
262            return $this->getAllPages()->subtract($this->materialize($entry->getInner()));
263        }
264
265        if ($entry instanceof NamespacePredicate) {
266            return $entry->filter($this->getAllPages());
267        }
268
269        return new PageSet();
270    }
271
272    /**
273     * Lazy-load the universe of all indexed pages
274     *
275     * @return PageSet all pages with score 0
276     */
277    protected function getAllPages(): PageSet
278    {
279        if ($this->allPages === null) {
280            $pages = (new Indexer())->getAllPages();
281            $this->allPages = new PageSet(array_fill_keys($pages, 0));
282        }
283        return $this->allPages;
284    }
285
286    // endregion
287}
288