xref: /dokuwiki/inc/Search/Query/QueryEvaluator.php (revision 9369b4a991666bc911474806b106d8958e79f4c1)
10b1bbbbbSAndreas Gohr<?php
20b1bbbbbSAndreas Gohr
30b1bbbbbSAndreas Gohrnamespace dokuwiki\Search\Query;
40b1bbbbbSAndreas Gohr
5*9369b4a9SAndreas Gohruse dokuwiki\Utf8\PhpString;
60b1bbbbbSAndreas Gohruse dokuwiki\Extension\Event;
70b1bbbbbSAndreas Gohruse dokuwiki\Search\Collection\Term;
80b1bbbbbSAndreas Gohruse dokuwiki\Search\Indexer;
90b1bbbbbSAndreas Gohruse dokuwiki\Utf8;
100b1bbbbbSAndreas Gohr
110b1bbbbbSAndreas Gohr/**
120b1bbbbbSAndreas Gohr * Evaluates a parsed search query against word lookup results
130b1bbbbbSAndreas Gohr *
140b1bbbbbSAndreas Gohr * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry)
150b1bbbbbSAndreas Gohr * to avoid materializing the full page index unless absolutely necessary
160b1bbbbbSAndreas Gohr * (standalone NOT or namespace-only queries).
170b1bbbbbSAndreas Gohr */
180b1bbbbbSAndreas Gohrclass QueryEvaluator
190b1bbbbbSAndreas Gohr{
200b1bbbbbSAndreas Gohr    /** @var string[] RPN token array from QueryParser */
210b1bbbbbSAndreas Gohr    protected array $rpn;
220b1bbbbbSAndreas Gohr
236734bb8cSAndreas Gohr    /** @var Term[] word => Term mapping from CollectionSearch */
240b1bbbbbSAndreas Gohr    protected array $terms;
250b1bbbbbSAndreas Gohr
260b1bbbbbSAndreas Gohr    /** @var PageSet|null lazy-loaded universe of all indexed pages */
270b1bbbbbSAndreas Gohr    protected ?PageSet $allPages = null;
280b1bbbbbSAndreas Gohr
290b1bbbbbSAndreas Gohr    /**
300b1bbbbbSAndreas Gohr     * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary']
316734bb8cSAndreas Gohr     * @param Term[] $terms word => Term mapping from CollectionSearch::execute()
320b1bbbbbSAndreas Gohr     */
330b1bbbbbSAndreas Gohr    public function __construct(array $rpn, array $terms)
340b1bbbbbSAndreas Gohr    {
350b1bbbbbSAndreas Gohr        $this->rpn = $rpn;
360b1bbbbbSAndreas Gohr        $this->terms = $terms;
370b1bbbbbSAndreas Gohr    }
380b1bbbbbSAndreas Gohr
390b1bbbbbSAndreas Gohr    /**
400b1bbbbbSAndreas Gohr     * Evaluate the RPN query and return matching pages with scores
410b1bbbbbSAndreas Gohr     *
420b1bbbbbSAndreas Gohr     * The query is represented in Reverse Polish Notation (RPN), also known as postfix
430b1bbbbbSAndreas Gohr     * notation. In RPN, operands come first and operators follow. For example, the infix
440b1bbbbbSAndreas Gohr     * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for
450b1bbbbbSAndreas Gohr     * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND".
460b1bbbbbSAndreas Gohr     *
470b1bbbbbSAndreas Gohr     * Evaluation uses a stack. Each token is processed left to right: operand tokens
480b1bbbbbSAndreas Gohr     * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND,
490b1bbbbbSAndreas Gohr     * OR, NOT) pop their operands from the stack, compute a result, and push it back.
500b1bbbbbSAndreas Gohr     * After all tokens are processed, the single remaining stack entry is the final result.
510b1bbbbbSAndreas Gohr     *
520b1bbbbbSAndreas Gohr     * The stack entries are typed: word lookups produce PageSet entries (concrete page
530b1bbbbbSAndreas Gohr     * results with scores), namespace tokens produce NamespacePredicate entries (a filter
540b1bbbbbSAndreas Gohr     * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators
550b1bbbbbSAndreas Gohr     * inspect the types of their operands to choose the most efficient operation — for
560b1bbbbbSAndreas Gohr     * example, AND with a NegatedEntry performs set subtraction rather than requiring
570b1bbbbbSAndreas Gohr     * the full page universe.
580b1bbbbbSAndreas Gohr     *
590b1bbbbbSAndreas Gohr     * @return array<string, int> page ID => score
600b1bbbbbSAndreas Gohr     */
610b1bbbbbSAndreas Gohr    public function evaluate(): array
620b1bbbbbSAndreas Gohr    {
630b1bbbbbSAndreas Gohr        /** @var StackEntry[] $stack */
640b1bbbbbSAndreas Gohr        $stack = [];
650b1bbbbbSAndreas Gohr
660b1bbbbbSAndreas Gohr        foreach ($this->rpn as $token) {
670b1bbbbbSAndreas Gohr            switch (substr($token, 0, 3)) {
680b1bbbbbSAndreas Gohr                case 'W+:':
690b1bbbbbSAndreas Gohr                case 'W-:':
700b1bbbbbSAndreas Gohr                case 'W_:':
710b1bbbbbSAndreas Gohr                    $word = substr($token, 3);
720b1bbbbbSAndreas Gohr                    if (isset($this->terms[$word])) {
73*9369b4a9SAndreas Gohr                        $stack[] = new PageSet($this->terms[$word]->getEntityFrequencies());
740b1bbbbbSAndreas Gohr                    }
750b1bbbbbSAndreas Gohr                    break;
760b1bbbbbSAndreas Gohr
770b1bbbbbSAndreas Gohr                case 'P+:':
780b1bbbbbSAndreas Gohr                case 'P-:':
790b1bbbbbSAndreas Gohr                    $phrase = substr($token, 3);
800b1bbbbbSAndreas Gohr                    // Phrases are always preceded by their component words AND'd together,
810b1bbbbbSAndreas Gohr                    // so the top of stack contains pages matching all words in the phrase.
820b1bbbbbSAndreas Gohr                    // We verify the actual phrase exists in those candidate pages.
830b1bbbbbSAndreas Gohr                    $candidates = end($stack) ?: new PageSet();
840b1bbbbbSAndreas Gohr                    $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates));
850b1bbbbbSAndreas Gohr                    break;
860b1bbbbbSAndreas Gohr
870b1bbbbbSAndreas Gohr                case 'N+:':
880b1bbbbbSAndreas Gohr                case 'N-:':
890b1bbbbbSAndreas Gohr                    $ns = cleanID(substr($token, 3)) . ':';
900b1bbbbbSAndreas Gohr                    $stack[] = new NamespacePredicate($ns);
910b1bbbbbSAndreas Gohr                    break;
920b1bbbbbSAndreas Gohr
930b1bbbbbSAndreas Gohr                case 'AND':
940b1bbbbbSAndreas Gohr                    $right = array_pop($stack);
950b1bbbbbSAndreas Gohr                    $left = array_pop($stack);
960b1bbbbbSAndreas Gohr                    $stack[] = $this->opAnd($left, $right);
970b1bbbbbSAndreas Gohr                    break;
980b1bbbbbSAndreas Gohr
990b1bbbbbSAndreas Gohr                case 'OR':
1000b1bbbbbSAndreas Gohr                    $right = array_pop($stack);
1010b1bbbbbSAndreas Gohr                    $left = array_pop($stack);
1020b1bbbbbSAndreas Gohr                    $stack[] = $this->opOr($left, $right);
1030b1bbbbbSAndreas Gohr                    break;
1040b1bbbbbSAndreas Gohr
1050b1bbbbbSAndreas Gohr                case 'NOT':
1060b1bbbbbSAndreas Gohr                    $operand = array_pop($stack);
1070b1bbbbbSAndreas Gohr                    $stack[] = new NegatedEntry($operand);
1080b1bbbbbSAndreas Gohr                    break;
1090b1bbbbbSAndreas Gohr            }
1100b1bbbbbSAndreas Gohr        }
1110b1bbbbbSAndreas Gohr
1120b1bbbbbSAndreas Gohr        $result = array_pop($stack) ?? new PageSet();
1130b1bbbbbSAndreas Gohr        return $this->materialize($result)->getPages();
1140b1bbbbbSAndreas Gohr    }
1150b1bbbbbSAndreas Gohr
1160b1bbbbbSAndreas Gohr    // region Operators
1170b1bbbbbSAndreas Gohr
1180b1bbbbbSAndreas Gohr    /**
1190b1bbbbbSAndreas Gohr     * AND: combine two operands based on their types
1200b1bbbbbSAndreas Gohr     *
1210b1bbbbbSAndreas Gohr     * PageSet AND PageSet produces an intersection with summed scores. When one
1220b1bbbbbSAndreas Gohr     * operand is a NegatedEntry, the operation becomes set subtraction. When one
1230b1bbbbbSAndreas Gohr     * operand is a NamespacePredicate, the other is filtered by namespace prefix.
1240b1bbbbbSAndreas Gohr     *
1250b1bbbbbSAndreas Gohr     * @param StackEntry $left
1260b1bbbbbSAndreas Gohr     * @param StackEntry $right
1270b1bbbbbSAndreas Gohr     * @return StackEntry
1280b1bbbbbSAndreas Gohr     */
1290b1bbbbbSAndreas Gohr    protected function opAnd(StackEntry $left, StackEntry $right): StackEntry
1300b1bbbbbSAndreas Gohr    {
1310b1bbbbbSAndreas Gohr        // page set AND negated → subtract
1320b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof NegatedEntry) {
1330b1bbbbbSAndreas Gohr            return $this->subtractNegated($left, $right);
1340b1bbbbbSAndreas Gohr        }
1350b1bbbbbSAndreas Gohr        if ($left instanceof NegatedEntry && $right instanceof PageSet) {
1360b1bbbbbSAndreas Gohr            return $this->subtractNegated($right, $left);
1370b1bbbbbSAndreas Gohr        }
1380b1bbbbbSAndreas Gohr
1390b1bbbbbSAndreas Gohr        // page set AND namespace → filter by prefix
1400b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof NamespacePredicate) {
1410b1bbbbbSAndreas Gohr            return $right->filter($left);
1420b1bbbbbSAndreas Gohr        }
1430b1bbbbbSAndreas Gohr        if ($left instanceof NamespacePredicate && $right instanceof PageSet) {
1440b1bbbbbSAndreas Gohr            return $left->filter($right);
1450b1bbbbbSAndreas Gohr        }
1460b1bbbbbSAndreas Gohr
1470b1bbbbbSAndreas Gohr        // page set AND page set → intersect, sum scores
1480b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof PageSet) {
1490b1bbbbbSAndreas Gohr            return $left->intersect($right);
1500b1bbbbbSAndreas Gohr        }
1510b1bbbbbSAndreas Gohr
1520b1bbbbbSAndreas Gohr        // rare cases (negated AND negated, namespace AND namespace, etc.)
1530b1bbbbbSAndreas Gohr        return $this->materialize($left)->intersect($this->materialize($right));
1540b1bbbbbSAndreas Gohr    }
1550b1bbbbbSAndreas Gohr
1560b1bbbbbSAndreas Gohr    /**
1570b1bbbbbSAndreas Gohr     * OR: unite two operands
1580b1bbbbbSAndreas Gohr     *
1590b1bbbbbSAndreas Gohr     * PageSet OR PageSet produces a union with summed scores. Other combinations
1600b1bbbbbSAndreas Gohr     * require materializing operands into concrete page sets first.
1610b1bbbbbSAndreas Gohr     *
1620b1bbbbbSAndreas Gohr     * @param StackEntry $left
1630b1bbbbbSAndreas Gohr     * @param StackEntry $right
1640b1bbbbbSAndreas Gohr     * @return StackEntry
1650b1bbbbbSAndreas Gohr     */
1660b1bbbbbSAndreas Gohr    protected function opOr(StackEntry $left, StackEntry $right): StackEntry
1670b1bbbbbSAndreas Gohr    {
1680b1bbbbbSAndreas Gohr        if ($left instanceof PageSet && $right instanceof PageSet) {
1690b1bbbbbSAndreas Gohr            return $left->unite($right);
1700b1bbbbbSAndreas Gohr        }
1710b1bbbbbSAndreas Gohr
1720b1bbbbbSAndreas Gohr        return $this->materialize($left)->unite($this->materialize($right));
1730b1bbbbbSAndreas Gohr    }
1740b1bbbbbSAndreas Gohr
1750b1bbbbbSAndreas Gohr    /**
1760b1bbbbbSAndreas Gohr     * Subtract a NegatedEntry from a PageSet
1770b1bbbbbSAndreas Gohr     *
1780b1bbbbbSAndreas Gohr     * The inner entry of the NegatedEntry determines the operation:
1790b1bbbbbSAndreas Gohr     * - NegatedEntry(PageSet): set subtraction
1800b1bbbbbSAndreas Gohr     * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace
1810b1bbbbbSAndreas Gohr     *
1820b1bbbbbSAndreas Gohr     * @param PageSet $pages the positive page set
1830b1bbbbbSAndreas Gohr     * @param NegatedEntry $negated the negated operand
1840b1bbbbbSAndreas Gohr     * @return PageSet
1850b1bbbbbSAndreas Gohr     */
1860b1bbbbbSAndreas Gohr    protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet
1870b1bbbbbSAndreas Gohr    {
1880b1bbbbbSAndreas Gohr        $inner = $negated->getInner();
1890b1bbbbbSAndreas Gohr
1900b1bbbbbSAndreas Gohr        if ($inner instanceof NamespacePredicate) {
1910b1bbbbbSAndreas Gohr            return $inner->exclude($pages);
1920b1bbbbbSAndreas Gohr        }
1930b1bbbbbSAndreas Gohr
1940b1bbbbbSAndreas Gohr        return $pages->subtract($this->materialize($inner));
1950b1bbbbbSAndreas Gohr    }
1960b1bbbbbSAndreas Gohr
1970b1bbbbbSAndreas Gohr    // endregion
1980b1bbbbbSAndreas Gohr
1990b1bbbbbSAndreas Gohr    // region Phrase matching
2000b1bbbbbSAndreas Gohr
2010b1bbbbbSAndreas Gohr    /**
2020b1bbbbbSAndreas Gohr     * Check which pages from the candidate set contain the given phrase
2030b1bbbbbSAndreas Gohr     *
2040b1bbbbbSAndreas Gohr     * Verifies phrase presence by reading each page's raw wiki text.
2050b1bbbbbSAndreas Gohr     * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event.
2060b1bbbbbSAndreas Gohr     * Pages that match retain their original scores from the candidate set.
2070b1bbbbbSAndreas Gohr     *
2080b1bbbbbSAndreas Gohr     * @param string $phrase the phrase to search for
2090b1bbbbbSAndreas Gohr     * @param PageSet $candidates pages to check (typically the AND'd word results)
2100b1bbbbbSAndreas Gohr     * @return PageSet pages where the phrase was found, with original scores preserved
2110b1bbbbbSAndreas Gohr     */
2120b1bbbbbSAndreas Gohr    protected function matchPhrase(string $phrase, PageSet $candidates): PageSet
2130b1bbbbbSAndreas Gohr    {
2140b1bbbbbSAndreas Gohr        $matched = [];
2150b1bbbbbSAndreas Gohr        foreach ($candidates->getPages() as $id => $score) {
2160b1bbbbbSAndreas Gohr            $evdata = [
2170b1bbbbbSAndreas Gohr                'id' => $id,
2180b1bbbbbSAndreas Gohr                'phrase' => $phrase,
2190b1bbbbbSAndreas Gohr                'text' => rawWiki($id),
2200b1bbbbbSAndreas Gohr            ];
2210b1bbbbbSAndreas Gohr            $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
2220b1bbbbbSAndreas Gohr            if ($event->advise_before() && $event->result !== true) {
223*9369b4a9SAndreas Gohr                $text = PhpString::strtolower($evdata['text']);
2240b1bbbbbSAndreas Gohr                if (str_contains($text, $phrase)) {
2250b1bbbbbSAndreas Gohr                    $event->result = true;
2260b1bbbbbSAndreas Gohr                }
2270b1bbbbbSAndreas Gohr            }
2280b1bbbbbSAndreas Gohr            $event->advise_after();
2290b1bbbbbSAndreas Gohr            if ($event->result === true) {
2300b1bbbbbSAndreas Gohr                $matched[$id] = $score;
2310b1bbbbbSAndreas Gohr            }
2320b1bbbbbSAndreas Gohr        }
2330b1bbbbbSAndreas Gohr        return new PageSet($matched);
2340b1bbbbbSAndreas Gohr    }
2350b1bbbbbSAndreas Gohr
2360b1bbbbbSAndreas Gohr    // endregion
2370b1bbbbbSAndreas Gohr
2380b1bbbbbSAndreas Gohr    // region Materialization
2390b1bbbbbSAndreas Gohr
2400b1bbbbbSAndreas Gohr    /**
2410b1bbbbbSAndreas Gohr     * Convert any StackEntry into a concrete PageSet
2420b1bbbbbSAndreas Gohr     *
2430b1bbbbbSAndreas Gohr     * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry
2440b1bbbbbSAndreas Gohr     * cannot be resolved without knowing all existing pages — a namespace
2450b1bbbbbSAndreas Gohr     * predicate needs the full page list to find matching pages, and a negated
2460b1bbbbbSAndreas Gohr     * entry needs it to compute the complement. This is why materialization
2470b1bbbbbSAndreas Gohr     * triggers a lazy-load of the full page index from disk.
2480b1bbbbbSAndreas Gohr     *
2490b1bbbbbSAndreas Gohr     * This is only needed for standalone namespace or negative-only queries
2500b1bbbbbSAndreas Gohr     * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed
2510b1bbbbbSAndreas Gohr     * operators handle these entries without the universe.
2520b1bbbbbSAndreas Gohr     *
2530b1bbbbbSAndreas Gohr     * @param StackEntry $entry
2540b1bbbbbSAndreas Gohr     * @return PageSet
2550b1bbbbbSAndreas Gohr     */
2560b1bbbbbSAndreas Gohr    protected function materialize(StackEntry $entry): PageSet
2570b1bbbbbSAndreas Gohr    {
2580b1bbbbbSAndreas Gohr        if ($entry instanceof PageSet) {
2590b1bbbbbSAndreas Gohr            return $entry;
2600b1bbbbbSAndreas Gohr        }
2610b1bbbbbSAndreas Gohr
2620b1bbbbbSAndreas Gohr        if ($entry instanceof NegatedEntry) {
2630b1bbbbbSAndreas Gohr            return $this->getAllPages()->subtract($this->materialize($entry->getInner()));
2640b1bbbbbSAndreas Gohr        }
2650b1bbbbbSAndreas Gohr
2660b1bbbbbSAndreas Gohr        if ($entry instanceof NamespacePredicate) {
2670b1bbbbbSAndreas Gohr            return $entry->filter($this->getAllPages());
2680b1bbbbbSAndreas Gohr        }
2690b1bbbbbSAndreas Gohr
2700b1bbbbbSAndreas Gohr        return new PageSet();
2710b1bbbbbSAndreas Gohr    }
2720b1bbbbbSAndreas Gohr
2730b1bbbbbSAndreas Gohr    /**
2740b1bbbbbSAndreas Gohr     * Lazy-load the universe of all indexed pages
2750b1bbbbbSAndreas Gohr     *
2760b1bbbbbSAndreas Gohr     * @return PageSet all pages with score 0
2770b1bbbbbSAndreas Gohr     */
2780b1bbbbbSAndreas Gohr    protected function getAllPages(): PageSet
2790b1bbbbbSAndreas Gohr    {
280*9369b4a9SAndreas Gohr        if (!$this->allPages instanceof PageSet) {
2810b1bbbbbSAndreas Gohr            $pages = (new Indexer())->getAllPages();
2820b1bbbbbSAndreas Gohr            $this->allPages = new PageSet(array_fill_keys($pages, 0));
2830b1bbbbbSAndreas Gohr        }
2840b1bbbbbSAndreas Gohr        return $this->allPages;
2850b1bbbbbSAndreas Gohr    }
2860b1bbbbbSAndreas Gohr
2870b1bbbbbSAndreas Gohr    // endregion
2880b1bbbbbSAndreas Gohr}
289