xref: /dokuwiki/inc/Search/Query/QueryEvaluator.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
10b1bbbbbSAndreas Gohr<?php
20b1bbbbbSAndreas Gohr
30b1bbbbbSAndreas Gohrnamespace dokuwiki\Search\Query;
40b1bbbbbSAndreas Gohr
50b1bbbbbSAndreas Gohruse dokuwiki\Extension\Event;
60b1bbbbbSAndreas Gohruse dokuwiki\Search\Collection\Term;
70b1bbbbbSAndreas Gohruse dokuwiki\Search\Indexer;
80b1bbbbbSAndreas Gohruse dokuwiki\Utf8;
90b1bbbbbSAndreas Gohr
100b1bbbbbSAndreas Gohr/**
110b1bbbbbSAndreas Gohr * Evaluates a parsed search query against word lookup results
120b1bbbbbSAndreas Gohr *
130b1bbbbbSAndreas Gohr * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry)
140b1bbbbbSAndreas Gohr * to avoid materializing the full page index unless absolutely necessary
150b1bbbbbSAndreas Gohr * (standalone NOT or namespace-only queries).
160b1bbbbbSAndreas Gohr */
170b1bbbbbSAndreas Gohrclass QueryEvaluator
180b1bbbbbSAndreas Gohr{
190b1bbbbbSAndreas Gohr    /** @var string[] RPN token array from QueryParser */
200b1bbbbbSAndreas Gohr    protected array $rpn;
210b1bbbbbSAndreas Gohr
22*6734bb8cSAndreas Gohr    /** @var Term[] word => Term mapping from CollectionSearch */
230b1bbbbbSAndreas Gohr    protected array $terms;
240b1bbbbbSAndreas Gohr
250b1bbbbbSAndreas Gohr    /** @var PageSet|null lazy-loaded universe of all indexed pages */
260b1bbbbbSAndreas Gohr    protected ?PageSet $allPages = null;
270b1bbbbbSAndreas Gohr
280b1bbbbbSAndreas Gohr    /**
290b1bbbbbSAndreas Gohr     * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary']
30*6734bb8cSAndreas Gohr     * @param Term[] $terms word => Term mapping from CollectionSearch::execute()
310b1bbbbbSAndreas Gohr     */
320b1bbbbbSAndreas Gohr    public function __construct(array $rpn, array $terms)
330b1bbbbbSAndreas Gohr    {
340b1bbbbbSAndreas Gohr        $this->rpn = $rpn;
350b1bbbbbSAndreas Gohr        $this->terms = $terms;
360b1bbbbbSAndreas Gohr    }
370b1bbbbbSAndreas Gohr
380b1bbbbbSAndreas Gohr    /**
390b1bbbbbSAndreas Gohr     * Evaluate the RPN query and return matching pages with scores
400b1bbbbbSAndreas Gohr     *
410b1bbbbbSAndreas Gohr     * The query is represented in Reverse Polish Notation (RPN), also known as postfix
420b1bbbbbSAndreas Gohr     * notation. In RPN, operands come first and operators follow. For example, the infix
430b1bbbbbSAndreas Gohr     * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for
440b1bbbbbSAndreas Gohr     * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND".
450b1bbbbbSAndreas Gohr     *
460b1bbbbbSAndreas Gohr     * Evaluation uses a stack. Each token is processed left to right: operand tokens
470b1bbbbbSAndreas Gohr     * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND,
480b1bbbbbSAndreas Gohr     * OR, NOT) pop their operands from the stack, compute a result, and push it back.
490b1bbbbbSAndreas Gohr     * After all tokens are processed, the single remaining stack entry is the final result.
500b1bbbbbSAndreas Gohr     *
510b1bbbbbSAndreas Gohr     * The stack entries are typed: word lookups produce PageSet entries (concrete page
520b1bbbbbSAndreas Gohr     * results with scores), namespace tokens produce NamespacePredicate entries (a filter
530b1bbbbbSAndreas Gohr     * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators
540b1bbbbbSAndreas Gohr     * inspect the types of their operands to choose the most efficient operation — for
550b1bbbbbSAndreas Gohr     * example, AND with a NegatedEntry performs set subtraction rather than requiring
560b1bbbbbSAndreas Gohr     * the full page universe.
570b1bbbbbSAndreas Gohr     *
580b1bbbbbSAndreas Gohr     * @return array<string, int> page ID => score
590b1bbbbbSAndreas Gohr     */
600b1bbbbbSAndreas Gohr    public function evaluate(): array
610b1bbbbbSAndreas Gohr    {
620b1bbbbbSAndreas Gohr        /** @var StackEntry[] $stack */
630b1bbbbbSAndreas Gohr        $stack = [];
640b1bbbbbSAndreas Gohr
650b1bbbbbSAndreas Gohr        foreach ($this->rpn as $token) {
660b1bbbbbSAndreas Gohr            switch (substr($token, 0, 3)) {
670b1bbbbbSAndreas Gohr                case 'W+:':
680b1bbbbbSAndreas Gohr                case 'W-:':
690b1bbbbbSAndreas Gohr                case 'W_:':
700b1bbbbbSAndreas Gohr                    $word = substr($token, 3);
710b1bbbbbSAndreas Gohr                    if (isset($this->terms[$word])) {
720b1bbbbbSAndreas Gohr                        $stack[] = new PageSet((array)$this->terms[$word]->getEntityFrequencies());
730b1bbbbbSAndreas Gohr                    }
740b1bbbbbSAndreas Gohr                    break;
750b1bbbbbSAndreas Gohr
760b1bbbbbSAndreas Gohr                case 'P+:':
770b1bbbbbSAndreas Gohr                case 'P-:':
780b1bbbbbSAndreas Gohr                    $phrase = substr($token, 3);
790b1bbbbbSAndreas Gohr                    // Phrases are always preceded by their component words AND'd together,
800b1bbbbbSAndreas Gohr                    // so the top of stack contains pages matching all words in the phrase.
810b1bbbbbSAndreas Gohr                    // We verify the actual phrase exists in those candidate pages.
820b1bbbbbSAndreas Gohr                    $candidates = end($stack) ?: new PageSet();
830b1bbbbbSAndreas Gohr                    $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates));
840b1bbbbbSAndreas Gohr                    break;
850b1bbbbbSAndreas Gohr
860b1bbbbbSAndreas Gohr                case 'N+:':
870b1bbbbbSAndreas Gohr                case 'N-:':
880b1bbbbbSAndreas Gohr                    $ns = cleanID(substr($token, 3)) . ':';
890b1bbbbbSAndreas Gohr                    $stack[] = new NamespacePredicate($ns);
900b1bbbbbSAndreas Gohr                    break;
910b1bbbbbSAndreas Gohr
920b1bbbbbSAndreas Gohr                case 'AND':
930b1bbbbbSAndreas Gohr                    $right = array_pop($stack);
940b1bbbbbSAndreas Gohr                    $left = array_pop($stack);
950b1bbbbbSAndreas Gohr                    $stack[] = $this->opAnd($left, $right);
960b1bbbbbSAndreas Gohr                    break;
970b1bbbbbSAndreas Gohr
980b1bbbbbSAndreas Gohr                case 'OR':
990b1bbbbbSAndreas Gohr                    $right = array_pop($stack);
1000b1bbbbbSAndreas Gohr                    $left = array_pop($stack);
1010b1bbbbbSAndreas Gohr                    $stack[] = $this->opOr($left, $right);
1020b1bbbbbSAndreas Gohr                    break;
1030b1bbbbbSAndreas Gohr
1040b1bbbbbSAndreas Gohr                case 'NOT':
1050b1bbbbbSAndreas Gohr                    $operand = array_pop($stack);
1060b1bbbbbSAndreas Gohr                    $stack[] = new NegatedEntry($operand);
1070b1bbbbbSAndreas Gohr                    break;
1080b1bbbbbSAndreas Gohr            }
1090b1bbbbbSAndreas Gohr        }
1100b1bbbbbSAndreas Gohr
1110b1bbbbbSAndreas Gohr        $result = array_pop($stack) ?? new PageSet();
1120b1bbbbbSAndreas Gohr        return $this->materialize($result)->getPages();
1130b1bbbbbSAndreas Gohr    }
1140b1bbbbbSAndreas Gohr
1150b1bbbbbSAndreas Gohr    // region Operators
1160b1bbbbbSAndreas Gohr
1170b1bbbbbSAndreas Gohr    /**
1180b1bbbbbSAndreas Gohr     * AND: combine two operands based on their types
1190b1bbbbbSAndreas Gohr     *
1200b1bbbbbSAndreas Gohr     * PageSet AND PageSet produces an intersection with summed scores. When one
1210b1bbbbbSAndreas Gohr     * operand is a NegatedEntry, the operation becomes set subtraction. When one
1220b1bbbbbSAndreas Gohr     * operand is a NamespacePredicate, the other is filtered by namespace prefix.
1230b1bbbbbSAndreas Gohr     *
1240b1bbbbbSAndreas Gohr     * @param StackEntry $left
1250b1bbbbbSAndreas Gohr     * @param StackEntry $right
1260b1bbbbbSAndreas Gohr     * @return StackEntry
1270b1bbbbbSAndreas Gohr     */
1280b1bbbbbSAndreas Gohr    protected function opAnd(StackEntry $left, StackEntry $right): StackEntry
1290b1bbbbbSAndreas Gohr    {
1300b1bbbbbSAndreas Gohr        // page set AND negated → subtract
1310b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof NegatedEntry) {
1320b1bbbbbSAndreas Gohr            return $this->subtractNegated($left, $right);
1330b1bbbbbSAndreas Gohr        }
1340b1bbbbbSAndreas Gohr        if ($left instanceof NegatedEntry && $right instanceof PageSet) {
1350b1bbbbbSAndreas Gohr            return $this->subtractNegated($right, $left);
1360b1bbbbbSAndreas Gohr        }
1370b1bbbbbSAndreas Gohr
1380b1bbbbbSAndreas Gohr        // page set AND namespace → filter by prefix
1390b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof NamespacePredicate) {
1400b1bbbbbSAndreas Gohr            return $right->filter($left);
1410b1bbbbbSAndreas Gohr        }
1420b1bbbbbSAndreas Gohr        if ($left instanceof NamespacePredicate && $right instanceof PageSet) {
1430b1bbbbbSAndreas Gohr            return $left->filter($right);
1440b1bbbbbSAndreas Gohr        }
1450b1bbbbbSAndreas Gohr
1460b1bbbbbSAndreas Gohr        // page set AND page set → intersect, sum scores
1470b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof PageSet) {
1480b1bbbbbSAndreas Gohr            return $left->intersect($right);
1490b1bbbbbSAndreas Gohr        }
1500b1bbbbbSAndreas Gohr
1510b1bbbbbSAndreas Gohr        // rare cases (negated AND negated, namespace AND namespace, etc.)
1520b1bbbbbSAndreas Gohr        return $this->materialize($left)->intersect($this->materialize($right));
1530b1bbbbbSAndreas Gohr    }
1540b1bbbbbSAndreas Gohr
1550b1bbbbbSAndreas Gohr    /**
1560b1bbbbbSAndreas Gohr     * OR: unite two operands
1570b1bbbbbSAndreas Gohr     *
1580b1bbbbbSAndreas Gohr     * PageSet OR PageSet produces a union with summed scores. Other combinations
1590b1bbbbbSAndreas Gohr     * require materializing operands into concrete page sets first.
1600b1bbbbbSAndreas Gohr     *
1610b1bbbbbSAndreas Gohr     * @param StackEntry $left
1620b1bbbbbSAndreas Gohr     * @param StackEntry $right
1630b1bbbbbSAndreas Gohr     * @return StackEntry
1640b1bbbbbSAndreas Gohr     */
1650b1bbbbbSAndreas Gohr    protected function opOr(StackEntry $left, StackEntry $right): StackEntry
1660b1bbbbbSAndreas Gohr    {
1670b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof PageSet) {
1680b1bbbbbSAndreas Gohr            return $left->unite($right);
1690b1bbbbbSAndreas Gohr        }
1700b1bbbbbSAndreas Gohr
1710b1bbbbbSAndreas Gohr        return $this->materialize($left)->unite($this->materialize($right));
1720b1bbbbbSAndreas Gohr    }
1730b1bbbbbSAndreas Gohr
1740b1bbbbbSAndreas Gohr    /**
1750b1bbbbbSAndreas Gohr     * Subtract a NegatedEntry from a PageSet
1760b1bbbbbSAndreas Gohr     *
1770b1bbbbbSAndreas Gohr     * The inner entry of the NegatedEntry determines the operation:
1780b1bbbbbSAndreas Gohr     * - NegatedEntry(PageSet): set subtraction
1790b1bbbbbSAndreas Gohr     * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace
1800b1bbbbbSAndreas Gohr     *
1810b1bbbbbSAndreas Gohr     * @param PageSet $pages the positive page set
1820b1bbbbbSAndreas Gohr     * @param NegatedEntry $negated the negated operand
1830b1bbbbbSAndreas Gohr     * @return PageSet
1840b1bbbbbSAndreas Gohr     */
1850b1bbbbbSAndreas Gohr    protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet
1860b1bbbbbSAndreas Gohr    {
1870b1bbbbbSAndreas Gohr        $inner = $negated->getInner();
1880b1bbbbbSAndreas Gohr
1890b1bbbbbSAndreas Gohr        if ($inner instanceof NamespacePredicate) {
1900b1bbbbbSAndreas Gohr            return $inner->exclude($pages);
1910b1bbbbbSAndreas Gohr        }
1920b1bbbbbSAndreas Gohr
1930b1bbbbbSAndreas Gohr        return $pages->subtract($this->materialize($inner));
1940b1bbbbbSAndreas Gohr    }
1950b1bbbbbSAndreas Gohr
1960b1bbbbbSAndreas Gohr    // endregion
1970b1bbbbbSAndreas Gohr
1980b1bbbbbSAndreas Gohr    // region Phrase matching
1990b1bbbbbSAndreas Gohr
2000b1bbbbbSAndreas Gohr    /**
2010b1bbbbbSAndreas Gohr     * Check which pages from the candidate set contain the given phrase
2020b1bbbbbSAndreas Gohr     *
2030b1bbbbbSAndreas Gohr     * Verifies phrase presence by reading each page's raw wiki text.
2040b1bbbbbSAndreas Gohr     * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event.
2050b1bbbbbSAndreas Gohr     * Pages that match retain their original scores from the candidate set.
2060b1bbbbbSAndreas Gohr     *
2070b1bbbbbSAndreas Gohr     * @param string $phrase the phrase to search for
2080b1bbbbbSAndreas Gohr     * @param PageSet $candidates pages to check (typically the AND'd word results)
2090b1bbbbbSAndreas Gohr     * @return PageSet pages where the phrase was found, with original scores preserved
2100b1bbbbbSAndreas Gohr     */
2110b1bbbbbSAndreas Gohr    protected function matchPhrase(string $phrase, PageSet $candidates): PageSet
2120b1bbbbbSAndreas Gohr    {
2130b1bbbbbSAndreas Gohr        $matched = [];
2140b1bbbbbSAndreas Gohr        foreach ($candidates->getPages() as $id => $score) {
2150b1bbbbbSAndreas Gohr            $evdata = [
2160b1bbbbbSAndreas Gohr                'id' => $id,
2170b1bbbbbSAndreas Gohr                'phrase' => $phrase,
2180b1bbbbbSAndreas Gohr                'text' => rawWiki($id),
2190b1bbbbbSAndreas Gohr            ];
2200b1bbbbbSAndreas Gohr            $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
2210b1bbbbbSAndreas Gohr            if ($event->advise_before() && $event->result !== true) {
2220b1bbbbbSAndreas Gohr                $text = Utf8\PhpString::strtolower($evdata['text']);
2230b1bbbbbSAndreas Gohr                if (str_contains($text, $phrase)) {
2240b1bbbbbSAndreas Gohr                    $event->result = true;
2250b1bbbbbSAndreas Gohr                }
2260b1bbbbbSAndreas Gohr            }
2270b1bbbbbSAndreas Gohr            $event->advise_after();
2280b1bbbbbSAndreas Gohr            if ($event->result === true) {
2290b1bbbbbSAndreas Gohr                $matched[$id] = $score;
2300b1bbbbbSAndreas Gohr            }
2310b1bbbbbSAndreas Gohr        }
2320b1bbbbbSAndreas Gohr        return new PageSet($matched);
2330b1bbbbbSAndreas Gohr    }
2340b1bbbbbSAndreas Gohr
2350b1bbbbbSAndreas Gohr    // endregion
2360b1bbbbbSAndreas Gohr
2370b1bbbbbSAndreas Gohr    // region Materialization
2380b1bbbbbSAndreas Gohr
2390b1bbbbbSAndreas Gohr    /**
2400b1bbbbbSAndreas Gohr     * Convert any StackEntry into a concrete PageSet
2410b1bbbbbSAndreas Gohr     *
2420b1bbbbbSAndreas Gohr     * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry
2430b1bbbbbSAndreas Gohr     * cannot be resolved without knowing all existing pages — a namespace
2440b1bbbbbSAndreas Gohr     * predicate needs the full page list to find matching pages, and a negated
2450b1bbbbbSAndreas Gohr     * entry needs it to compute the complement. This is why materialization
2460b1bbbbbSAndreas Gohr     * triggers a lazy-load of the full page index from disk.
2470b1bbbbbSAndreas Gohr     *
2480b1bbbbbSAndreas Gohr     * This is only needed for standalone namespace or negative-only queries
2490b1bbbbbSAndreas Gohr     * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed
2500b1bbbbbSAndreas Gohr     * operators handle these entries without the universe.
2510b1bbbbbSAndreas Gohr     *
2520b1bbbbbSAndreas Gohr     * @param StackEntry $entry
2530b1bbbbbSAndreas Gohr     * @return PageSet
2540b1bbbbbSAndreas Gohr     */
2550b1bbbbbSAndreas Gohr    protected function materialize(StackEntry $entry): PageSet
2560b1bbbbbSAndreas Gohr    {
2570b1bbbbbSAndreas Gohr        if ($entry instanceof PageSet) {
2580b1bbbbbSAndreas Gohr            return $entry;
2590b1bbbbbSAndreas Gohr        }
2600b1bbbbbSAndreas Gohr
2610b1bbbbbSAndreas Gohr        if ($entry instanceof NegatedEntry) {
2620b1bbbbbSAndreas Gohr            return $this->getAllPages()->subtract($this->materialize($entry->getInner()));
2630b1bbbbbSAndreas Gohr        }
2640b1bbbbbSAndreas Gohr
2650b1bbbbbSAndreas Gohr        if ($entry instanceof NamespacePredicate) {
2660b1bbbbbSAndreas Gohr            return $entry->filter($this->getAllPages());
2670b1bbbbbSAndreas Gohr        }
2680b1bbbbbSAndreas Gohr
2690b1bbbbbSAndreas Gohr        return new PageSet();
2700b1bbbbbSAndreas Gohr    }
2710b1bbbbbSAndreas Gohr
2720b1bbbbbSAndreas Gohr    /**
2730b1bbbbbSAndreas Gohr     * Lazy-load the universe of all indexed pages
2740b1bbbbbSAndreas Gohr     *
2750b1bbbbbSAndreas Gohr     * @return PageSet all pages with score 0
2760b1bbbbbSAndreas Gohr     */
2770b1bbbbbSAndreas Gohr    protected function getAllPages(): PageSet
2780b1bbbbbSAndreas Gohr    {
2790b1bbbbbSAndreas Gohr        if ($this->allPages === null) {
2800b1bbbbbSAndreas Gohr            $pages = (new Indexer())->getAllPages();
2810b1bbbbbSAndreas Gohr            $this->allPages = new PageSet(array_fill_keys($pages, 0));
2820b1bbbbbSAndreas Gohr        }
2830b1bbbbbSAndreas Gohr        return $this->allPages;
2840b1bbbbbSAndreas Gohr    }
2850b1bbbbbSAndreas Gohr
2860b1bbbbbSAndreas Gohr    // endregion
2870b1bbbbbSAndreas Gohr}
288