xref: /dokuwiki/inc/Search/Query/QueryEvaluator.php (revision 0b1bbbbb7d4e3c531cd255dbf878ce27d5967a0c)
1*0b1bbbbbSAndreas Gohr<?php
2*0b1bbbbbSAndreas Gohr
3*0b1bbbbbSAndreas Gohrnamespace dokuwiki\Search\Query;
4*0b1bbbbbSAndreas Gohr
5*0b1bbbbbSAndreas Gohruse dokuwiki\Extension\Event;
6*0b1bbbbbSAndreas Gohruse dokuwiki\Search\Collection\Term;
7*0b1bbbbbSAndreas Gohruse dokuwiki\Search\Indexer;
8*0b1bbbbbSAndreas Gohruse dokuwiki\Utf8;
9*0b1bbbbbSAndreas Gohr
10*0b1bbbbbSAndreas Gohr/**
11*0b1bbbbbSAndreas Gohr * Evaluates a parsed search query against word lookup results
12*0b1bbbbbSAndreas Gohr *
13*0b1bbbbbSAndreas Gohr * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry)
14*0b1bbbbbSAndreas Gohr * to avoid materializing the full page index unless absolutely necessary
15*0b1bbbbbSAndreas Gohr * (standalone NOT or namespace-only queries).
16*0b1bbbbbSAndreas Gohr */
17*0b1bbbbbSAndreas Gohrclass QueryEvaluator
18*0b1bbbbbSAndreas Gohr{
19*0b1bbbbbSAndreas Gohr    /** @var string[] RPN token array from QueryParser */
20*0b1bbbbbSAndreas Gohr    protected array $rpn;
21*0b1bbbbbSAndreas Gohr
22*0b1bbbbbSAndreas Gohr    /** @var Term[] word => Term mapping from FrequencyCollectionSearch */
23*0b1bbbbbSAndreas Gohr    protected array $terms;
24*0b1bbbbbSAndreas Gohr
25*0b1bbbbbSAndreas Gohr    /** @var PageSet|null lazy-loaded universe of all indexed pages */
26*0b1bbbbbSAndreas Gohr    protected ?PageSet $allPages = null;
27*0b1bbbbbSAndreas Gohr
28*0b1bbbbbSAndreas Gohr    /**
29*0b1bbbbbSAndreas Gohr     * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary']
30*0b1bbbbbSAndreas Gohr     * @param Term[] $terms word => Term mapping from FrequencyCollectionSearch::execute()
31*0b1bbbbbSAndreas Gohr     */
32*0b1bbbbbSAndreas Gohr    public function __construct(array $rpn, array $terms)
33*0b1bbbbbSAndreas Gohr    {
34*0b1bbbbbSAndreas Gohr        $this->rpn = $rpn;
35*0b1bbbbbSAndreas Gohr        $this->terms = $terms;
36*0b1bbbbbSAndreas Gohr    }
37*0b1bbbbbSAndreas Gohr
38*0b1bbbbbSAndreas Gohr    /**
39*0b1bbbbbSAndreas Gohr     * Evaluate the RPN query and return matching pages with scores
40*0b1bbbbbSAndreas Gohr     *
41*0b1bbbbbSAndreas Gohr     * The query is represented in Reverse Polish Notation (RPN), also known as postfix
42*0b1bbbbbSAndreas Gohr     * notation. In RPN, operands come first and operators follow. For example, the infix
43*0b1bbbbbSAndreas Gohr     * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for
44*0b1bbbbbSAndreas Gohr     * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND".
45*0b1bbbbbSAndreas Gohr     *
46*0b1bbbbbSAndreas Gohr     * Evaluation uses a stack. Each token is processed left to right: operand tokens
47*0b1bbbbbSAndreas Gohr     * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND,
48*0b1bbbbbSAndreas Gohr     * OR, NOT) pop their operands from the stack, compute a result, and push it back.
49*0b1bbbbbSAndreas Gohr     * After all tokens are processed, the single remaining stack entry is the final result.
50*0b1bbbbbSAndreas Gohr     *
51*0b1bbbbbSAndreas Gohr     * The stack entries are typed: word lookups produce PageSet entries (concrete page
52*0b1bbbbbSAndreas Gohr     * results with scores), namespace tokens produce NamespacePredicate entries (a filter
53*0b1bbbbbSAndreas Gohr     * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators
54*0b1bbbbbSAndreas Gohr     * inspect the types of their operands to choose the most efficient operation — for
55*0b1bbbbbSAndreas Gohr     * example, AND with a NegatedEntry performs set subtraction rather than requiring
56*0b1bbbbbSAndreas Gohr     * the full page universe.
57*0b1bbbbbSAndreas Gohr     *
58*0b1bbbbbSAndreas Gohr     * @return array<string, int> page ID => score
59*0b1bbbbbSAndreas Gohr     */
60*0b1bbbbbSAndreas Gohr    public function evaluate(): array
61*0b1bbbbbSAndreas Gohr    {
62*0b1bbbbbSAndreas Gohr        /** @var StackEntry[] $stack */
63*0b1bbbbbSAndreas Gohr        $stack = [];
64*0b1bbbbbSAndreas Gohr
65*0b1bbbbbSAndreas Gohr        foreach ($this->rpn as $token) {
66*0b1bbbbbSAndreas Gohr            switch (substr($token, 0, 3)) {
67*0b1bbbbbSAndreas Gohr                case 'W+:':
68*0b1bbbbbSAndreas Gohr                case 'W-:':
69*0b1bbbbbSAndreas Gohr                case 'W_:':
70*0b1bbbbbSAndreas Gohr                    $word = substr($token, 3);
71*0b1bbbbbSAndreas Gohr                    if (isset($this->terms[$word])) {
72*0b1bbbbbSAndreas Gohr                        $stack[] = new PageSet((array)$this->terms[$word]->getEntityFrequencies());
73*0b1bbbbbSAndreas Gohr                    }
74*0b1bbbbbSAndreas Gohr                    break;
75*0b1bbbbbSAndreas Gohr
76*0b1bbbbbSAndreas Gohr                case 'P+:':
77*0b1bbbbbSAndreas Gohr                case 'P-:':
78*0b1bbbbbSAndreas Gohr                    $phrase = substr($token, 3);
79*0b1bbbbbSAndreas Gohr                    // Phrases are always preceded by their component words AND'd together,
80*0b1bbbbbSAndreas Gohr                    // so the top of stack contains pages matching all words in the phrase.
81*0b1bbbbbSAndreas Gohr                    // We verify the actual phrase exists in those candidate pages.
82*0b1bbbbbSAndreas Gohr                    $candidates = end($stack) ?: new PageSet();
83*0b1bbbbbSAndreas Gohr                    $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates));
84*0b1bbbbbSAndreas Gohr                    break;
85*0b1bbbbbSAndreas Gohr
86*0b1bbbbbSAndreas Gohr                case 'N+:':
87*0b1bbbbbSAndreas Gohr                case 'N-:':
88*0b1bbbbbSAndreas Gohr                    $ns = cleanID(substr($token, 3)) . ':';
89*0b1bbbbbSAndreas Gohr                    $stack[] = new NamespacePredicate($ns);
90*0b1bbbbbSAndreas Gohr                    break;
91*0b1bbbbbSAndreas Gohr
92*0b1bbbbbSAndreas Gohr                case 'AND':
93*0b1bbbbbSAndreas Gohr                    $right = array_pop($stack);
94*0b1bbbbbSAndreas Gohr                    $left = array_pop($stack);
95*0b1bbbbbSAndreas Gohr                    $stack[] = $this->opAnd($left, $right);
96*0b1bbbbbSAndreas Gohr                    break;
97*0b1bbbbbSAndreas Gohr
98*0b1bbbbbSAndreas Gohr                case 'OR':
99*0b1bbbbbSAndreas Gohr                    $right = array_pop($stack);
100*0b1bbbbbSAndreas Gohr                    $left = array_pop($stack);
101*0b1bbbbbSAndreas Gohr                    $stack[] = $this->opOr($left, $right);
102*0b1bbbbbSAndreas Gohr                    break;
103*0b1bbbbbSAndreas Gohr
104*0b1bbbbbSAndreas Gohr                case 'NOT':
105*0b1bbbbbSAndreas Gohr                    $operand = array_pop($stack);
106*0b1bbbbbSAndreas Gohr                    $stack[] = new NegatedEntry($operand);
107*0b1bbbbbSAndreas Gohr                    break;
108*0b1bbbbbSAndreas Gohr            }
109*0b1bbbbbSAndreas Gohr        }
110*0b1bbbbbSAndreas Gohr
111*0b1bbbbbSAndreas Gohr        $result = array_pop($stack) ?? new PageSet();
112*0b1bbbbbSAndreas Gohr        return $this->materialize($result)->getPages();
113*0b1bbbbbSAndreas Gohr    }
114*0b1bbbbbSAndreas Gohr
115*0b1bbbbbSAndreas Gohr    // region Operators
116*0b1bbbbbSAndreas Gohr
117*0b1bbbbbSAndreas Gohr    /**
118*0b1bbbbbSAndreas Gohr     * AND: combine two operands based on their types
119*0b1bbbbbSAndreas Gohr     *
120*0b1bbbbbSAndreas Gohr     * PageSet AND PageSet produces an intersection with summed scores. When one
121*0b1bbbbbSAndreas Gohr     * operand is a NegatedEntry, the operation becomes set subtraction. When one
122*0b1bbbbbSAndreas Gohr     * operand is a NamespacePredicate, the other is filtered by namespace prefix.
123*0b1bbbbbSAndreas Gohr     *
124*0b1bbbbbSAndreas Gohr     * @param StackEntry $left
125*0b1bbbbbSAndreas Gohr     * @param StackEntry $right
126*0b1bbbbbSAndreas Gohr     * @return StackEntry
127*0b1bbbbbSAndreas Gohr     */
128*0b1bbbbbSAndreas Gohr    protected function opAnd(StackEntry $left, StackEntry $right): StackEntry
129*0b1bbbbbSAndreas Gohr    {
130*0b1bbbbbSAndreas Gohr        // page set AND negated → subtract
131*0b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof NegatedEntry) {
132*0b1bbbbbSAndreas Gohr            return $this->subtractNegated($left, $right);
133*0b1bbbbbSAndreas Gohr        }
134*0b1bbbbbSAndreas Gohr        if ($left instanceof NegatedEntry && $right instanceof PageSet) {
135*0b1bbbbbSAndreas Gohr            return $this->subtractNegated($right, $left);
136*0b1bbbbbSAndreas Gohr        }
137*0b1bbbbbSAndreas Gohr
138*0b1bbbbbSAndreas Gohr        // page set AND namespace → filter by prefix
139*0b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof NamespacePredicate) {
140*0b1bbbbbSAndreas Gohr            return $right->filter($left);
141*0b1bbbbbSAndreas Gohr        }
142*0b1bbbbbSAndreas Gohr        if ($left instanceof NamespacePredicate && $right instanceof PageSet) {
143*0b1bbbbbSAndreas Gohr            return $left->filter($right);
144*0b1bbbbbSAndreas Gohr        }
145*0b1bbbbbSAndreas Gohr
146*0b1bbbbbSAndreas Gohr        // page set AND page set → intersect, sum scores
147*0b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof PageSet) {
148*0b1bbbbbSAndreas Gohr            return $left->intersect($right);
149*0b1bbbbbSAndreas Gohr        }
150*0b1bbbbbSAndreas Gohr
151*0b1bbbbbSAndreas Gohr        // rare cases (negated AND negated, namespace AND namespace, etc.)
152*0b1bbbbbSAndreas Gohr        return $this->materialize($left)->intersect($this->materialize($right));
153*0b1bbbbbSAndreas Gohr    }
154*0b1bbbbbSAndreas Gohr
155*0b1bbbbbSAndreas Gohr    /**
156*0b1bbbbbSAndreas Gohr     * OR: unite two operands
157*0b1bbbbbSAndreas Gohr     *
158*0b1bbbbbSAndreas Gohr     * PageSet OR PageSet produces a union with summed scores. Other combinations
159*0b1bbbbbSAndreas Gohr     * require materializing operands into concrete page sets first.
160*0b1bbbbbSAndreas Gohr     *
161*0b1bbbbbSAndreas Gohr     * @param StackEntry $left
162*0b1bbbbbSAndreas Gohr     * @param StackEntry $right
163*0b1bbbbbSAndreas Gohr     * @return StackEntry
164*0b1bbbbbSAndreas Gohr     */
165*0b1bbbbbSAndreas Gohr    protected function opOr(StackEntry $left, StackEntry $right): StackEntry
166*0b1bbbbbSAndreas Gohr    {
167*0b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof PageSet) {
168*0b1bbbbbSAndreas Gohr            return $left->unite($right);
169*0b1bbbbbSAndreas Gohr        }
170*0b1bbbbbSAndreas Gohr
171*0b1bbbbbSAndreas Gohr        return $this->materialize($left)->unite($this->materialize($right));
172*0b1bbbbbSAndreas Gohr    }
173*0b1bbbbbSAndreas Gohr
174*0b1bbbbbSAndreas Gohr    /**
175*0b1bbbbbSAndreas Gohr     * Subtract a NegatedEntry from a PageSet
176*0b1bbbbbSAndreas Gohr     *
177*0b1bbbbbSAndreas Gohr     * The inner entry of the NegatedEntry determines the operation:
178*0b1bbbbbSAndreas Gohr     * - NegatedEntry(PageSet): set subtraction
179*0b1bbbbbSAndreas Gohr     * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace
180*0b1bbbbbSAndreas Gohr     *
181*0b1bbbbbSAndreas Gohr     * @param PageSet $pages the positive page set
182*0b1bbbbbSAndreas Gohr     * @param NegatedEntry $negated the negated operand
183*0b1bbbbbSAndreas Gohr     * @return PageSet
184*0b1bbbbbSAndreas Gohr     */
185*0b1bbbbbSAndreas Gohr    protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet
186*0b1bbbbbSAndreas Gohr    {
187*0b1bbbbbSAndreas Gohr        $inner = $negated->getInner();
188*0b1bbbbbSAndreas Gohr
189*0b1bbbbbSAndreas Gohr        if ($inner instanceof NamespacePredicate) {
190*0b1bbbbbSAndreas Gohr            return $inner->exclude($pages);
191*0b1bbbbbSAndreas Gohr        }
192*0b1bbbbbSAndreas Gohr
193*0b1bbbbbSAndreas Gohr        return $pages->subtract($this->materialize($inner));
194*0b1bbbbbSAndreas Gohr    }
195*0b1bbbbbSAndreas Gohr
196*0b1bbbbbSAndreas Gohr    // endregion
197*0b1bbbbbSAndreas Gohr
198*0b1bbbbbSAndreas Gohr    // region Phrase matching
199*0b1bbbbbSAndreas Gohr
200*0b1bbbbbSAndreas Gohr    /**
201*0b1bbbbbSAndreas Gohr     * Check which pages from the candidate set contain the given phrase
202*0b1bbbbbSAndreas Gohr     *
203*0b1bbbbbSAndreas Gohr     * Verifies phrase presence by reading each page's raw wiki text.
204*0b1bbbbbSAndreas Gohr     * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event.
205*0b1bbbbbSAndreas Gohr     * Pages that match retain their original scores from the candidate set.
206*0b1bbbbbSAndreas Gohr     *
207*0b1bbbbbSAndreas Gohr     * @param string $phrase the phrase to search for
208*0b1bbbbbSAndreas Gohr     * @param PageSet $candidates pages to check (typically the AND'd word results)
209*0b1bbbbbSAndreas Gohr     * @return PageSet pages where the phrase was found, with original scores preserved
210*0b1bbbbbSAndreas Gohr     */
211*0b1bbbbbSAndreas Gohr    protected function matchPhrase(string $phrase, PageSet $candidates): PageSet
212*0b1bbbbbSAndreas Gohr    {
213*0b1bbbbbSAndreas Gohr        $matched = [];
214*0b1bbbbbSAndreas Gohr        foreach ($candidates->getPages() as $id => $score) {
215*0b1bbbbbSAndreas Gohr            $evdata = [
216*0b1bbbbbSAndreas Gohr                'id' => $id,
217*0b1bbbbbSAndreas Gohr                'phrase' => $phrase,
218*0b1bbbbbSAndreas Gohr                'text' => rawWiki($id),
219*0b1bbbbbSAndreas Gohr            ];
220*0b1bbbbbSAndreas Gohr            $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
221*0b1bbbbbSAndreas Gohr            if ($event->advise_before() && $event->result !== true) {
222*0b1bbbbbSAndreas Gohr                $text = Utf8\PhpString::strtolower($evdata['text']);
223*0b1bbbbbSAndreas Gohr                if (str_contains($text, $phrase)) {
224*0b1bbbbbSAndreas Gohr                    $event->result = true;
225*0b1bbbbbSAndreas Gohr                }
226*0b1bbbbbSAndreas Gohr            }
227*0b1bbbbbSAndreas Gohr            $event->advise_after();
228*0b1bbbbbSAndreas Gohr            if ($event->result === true) {
229*0b1bbbbbSAndreas Gohr                $matched[$id] = $score;
230*0b1bbbbbSAndreas Gohr            }
231*0b1bbbbbSAndreas Gohr        }
232*0b1bbbbbSAndreas Gohr        return new PageSet($matched);
233*0b1bbbbbSAndreas Gohr    }
234*0b1bbbbbSAndreas Gohr
235*0b1bbbbbSAndreas Gohr    // endregion
236*0b1bbbbbSAndreas Gohr
237*0b1bbbbbSAndreas Gohr    // region Materialization
238*0b1bbbbbSAndreas Gohr
239*0b1bbbbbSAndreas Gohr    /**
240*0b1bbbbbSAndreas Gohr     * Convert any StackEntry into a concrete PageSet
241*0b1bbbbbSAndreas Gohr     *
242*0b1bbbbbSAndreas Gohr     * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry
243*0b1bbbbbSAndreas Gohr     * cannot be resolved without knowing all existing pages — a namespace
244*0b1bbbbbSAndreas Gohr     * predicate needs the full page list to find matching pages, and a negated
245*0b1bbbbbSAndreas Gohr     * entry needs it to compute the complement. This is why materialization
246*0b1bbbbbSAndreas Gohr     * triggers a lazy-load of the full page index from disk.
247*0b1bbbbbSAndreas Gohr     *
248*0b1bbbbbSAndreas Gohr     * This is only needed for standalone namespace or negative-only queries
249*0b1bbbbbSAndreas Gohr     * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed
250*0b1bbbbbSAndreas Gohr     * operators handle these entries without the universe.
251*0b1bbbbbSAndreas Gohr     *
252*0b1bbbbbSAndreas Gohr     * @param StackEntry $entry
253*0b1bbbbbSAndreas Gohr     * @return PageSet
254*0b1bbbbbSAndreas Gohr     */
255*0b1bbbbbSAndreas Gohr    protected function materialize(StackEntry $entry): PageSet
256*0b1bbbbbSAndreas Gohr    {
257*0b1bbbbbSAndreas Gohr        if ($entry instanceof PageSet) {
258*0b1bbbbbSAndreas Gohr            return $entry;
259*0b1bbbbbSAndreas Gohr        }
260*0b1bbbbbSAndreas Gohr
261*0b1bbbbbSAndreas Gohr        if ($entry instanceof NegatedEntry) {
262*0b1bbbbbSAndreas Gohr            return $this->getAllPages()->subtract($this->materialize($entry->getInner()));
263*0b1bbbbbSAndreas Gohr        }
264*0b1bbbbbSAndreas Gohr
265*0b1bbbbbSAndreas Gohr        if ($entry instanceof NamespacePredicate) {
266*0b1bbbbbSAndreas Gohr            return $entry->filter($this->getAllPages());
267*0b1bbbbbSAndreas Gohr        }
268*0b1bbbbbSAndreas Gohr
269*0b1bbbbbSAndreas Gohr        return new PageSet();
270*0b1bbbbbSAndreas Gohr    }
271*0b1bbbbbSAndreas Gohr
272*0b1bbbbbSAndreas Gohr    /**
273*0b1bbbbbSAndreas Gohr     * Lazy-load the universe of all indexed pages
274*0b1bbbbbSAndreas Gohr     *
275*0b1bbbbbSAndreas Gohr     * @return PageSet all pages with score 0
276*0b1bbbbbSAndreas Gohr     */
277*0b1bbbbbSAndreas Gohr    protected function getAllPages(): PageSet
278*0b1bbbbbSAndreas Gohr    {
279*0b1bbbbbSAndreas Gohr        if ($this->allPages === null) {
280*0b1bbbbbSAndreas Gohr            $pages = (new Indexer())->getAllPages();
281*0b1bbbbbSAndreas Gohr            $this->allPages = new PageSet(array_fill_keys($pages, 0));
282*0b1bbbbbSAndreas Gohr        }
283*0b1bbbbbSAndreas Gohr        return $this->allPages;
284*0b1bbbbbSAndreas Gohr    }
285*0b1bbbbbSAndreas Gohr
286*0b1bbbbbSAndreas Gohr    // endregion
287*0b1bbbbbSAndreas Gohr}
288