10b1bbbbbSAndreas Gohr<?php 20b1bbbbbSAndreas Gohr 30b1bbbbbSAndreas Gohrnamespace dokuwiki\Search\Query; 40b1bbbbbSAndreas Gohr 5*9369b4a9SAndreas Gohruse dokuwiki\Utf8\PhpString; 60b1bbbbbSAndreas Gohruse dokuwiki\Extension\Event; 70b1bbbbbSAndreas Gohruse dokuwiki\Search\Collection\Term; 80b1bbbbbSAndreas Gohruse dokuwiki\Search\Indexer; 90b1bbbbbSAndreas Gohruse dokuwiki\Utf8; 100b1bbbbbSAndreas Gohr 110b1bbbbbSAndreas Gohr/** 120b1bbbbbSAndreas Gohr * Evaluates a parsed search query against word lookup results 130b1bbbbbSAndreas Gohr * 140b1bbbbbSAndreas Gohr * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry) 150b1bbbbbSAndreas Gohr * to avoid materializing the full page index unless absolutely necessary 160b1bbbbbSAndreas Gohr * (standalone NOT or namespace-only queries). 170b1bbbbbSAndreas Gohr */ 180b1bbbbbSAndreas Gohrclass QueryEvaluator 190b1bbbbbSAndreas Gohr{ 200b1bbbbbSAndreas Gohr /** @var string[] RPN token array from QueryParser */ 210b1bbbbbSAndreas Gohr protected array $rpn; 220b1bbbbbSAndreas Gohr 236734bb8cSAndreas Gohr /** @var Term[] word => Term mapping from CollectionSearch */ 240b1bbbbbSAndreas Gohr protected array $terms; 250b1bbbbbSAndreas Gohr 260b1bbbbbSAndreas Gohr /** @var PageSet|null lazy-loaded universe of all indexed pages */ 270b1bbbbbSAndreas Gohr protected ?PageSet $allPages = null; 280b1bbbbbSAndreas Gohr 290b1bbbbbSAndreas Gohr /** 300b1bbbbbSAndreas Gohr * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary'] 316734bb8cSAndreas Gohr * @param Term[] $terms word => Term mapping from CollectionSearch::execute() 320b1bbbbbSAndreas Gohr */ 330b1bbbbbSAndreas Gohr public function __construct(array $rpn, array $terms) 340b1bbbbbSAndreas Gohr { 350b1bbbbbSAndreas Gohr $this->rpn = $rpn; 360b1bbbbbSAndreas Gohr $this->terms = $terms; 370b1bbbbbSAndreas Gohr } 380b1bbbbbSAndreas Gohr 390b1bbbbbSAndreas Gohr /** 400b1bbbbbSAndreas Gohr * Evaluate the RPN query and return matching pages with scores 410b1bbbbbSAndreas Gohr * 420b1bbbbbSAndreas Gohr * The query is represented in Reverse Polish Notation (RPN), also known as postfix 430b1bbbbbSAndreas Gohr * notation. In RPN, operands come first and operators follow. For example, the infix 440b1bbbbbSAndreas Gohr * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for 450b1bbbbbSAndreas Gohr * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND". 460b1bbbbbSAndreas Gohr * 470b1bbbbbSAndreas Gohr * Evaluation uses a stack. Each token is processed left to right: operand tokens 480b1bbbbbSAndreas Gohr * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND, 490b1bbbbbSAndreas Gohr * OR, NOT) pop their operands from the stack, compute a result, and push it back. 500b1bbbbbSAndreas Gohr * After all tokens are processed, the single remaining stack entry is the final result. 510b1bbbbbSAndreas Gohr * 520b1bbbbbSAndreas Gohr * The stack entries are typed: word lookups produce PageSet entries (concrete page 530b1bbbbbSAndreas Gohr * results with scores), namespace tokens produce NamespacePredicate entries (a filter 540b1bbbbbSAndreas Gohr * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators 550b1bbbbbSAndreas Gohr * inspect the types of their operands to choose the most efficient operation — for 560b1bbbbbSAndreas Gohr * example, AND with a NegatedEntry performs set subtraction rather than requiring 570b1bbbbbSAndreas Gohr * the full page universe. 580b1bbbbbSAndreas Gohr * 590b1bbbbbSAndreas Gohr * @return array<string, int> page ID => score 600b1bbbbbSAndreas Gohr */ 610b1bbbbbSAndreas Gohr public function evaluate(): array 620b1bbbbbSAndreas Gohr { 630b1bbbbbSAndreas Gohr /** @var StackEntry[] $stack */ 640b1bbbbbSAndreas Gohr $stack = []; 650b1bbbbbSAndreas Gohr 660b1bbbbbSAndreas Gohr foreach ($this->rpn as $token) { 670b1bbbbbSAndreas Gohr switch (substr($token, 0, 3)) { 680b1bbbbbSAndreas Gohr case 'W+:': 690b1bbbbbSAndreas Gohr case 'W-:': 700b1bbbbbSAndreas Gohr case 'W_:': 710b1bbbbbSAndreas Gohr $word = substr($token, 3); 720b1bbbbbSAndreas Gohr if (isset($this->terms[$word])) { 73*9369b4a9SAndreas Gohr $stack[] = new PageSet($this->terms[$word]->getEntityFrequencies()); 740b1bbbbbSAndreas Gohr } 750b1bbbbbSAndreas Gohr break; 760b1bbbbbSAndreas Gohr 770b1bbbbbSAndreas Gohr case 'P+:': 780b1bbbbbSAndreas Gohr case 'P-:': 790b1bbbbbSAndreas Gohr $phrase = substr($token, 3); 800b1bbbbbSAndreas Gohr // Phrases are always preceded by their component words AND'd together, 810b1bbbbbSAndreas Gohr // so the top of stack contains pages matching all words in the phrase. 820b1bbbbbSAndreas Gohr // We verify the actual phrase exists in those candidate pages. 830b1bbbbbSAndreas Gohr $candidates = end($stack) ?: new PageSet(); 840b1bbbbbSAndreas Gohr $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates)); 850b1bbbbbSAndreas Gohr break; 860b1bbbbbSAndreas Gohr 870b1bbbbbSAndreas Gohr case 'N+:': 880b1bbbbbSAndreas Gohr case 'N-:': 890b1bbbbbSAndreas Gohr $ns = cleanID(substr($token, 3)) . ':'; 900b1bbbbbSAndreas Gohr $stack[] = new NamespacePredicate($ns); 910b1bbbbbSAndreas Gohr break; 920b1bbbbbSAndreas Gohr 930b1bbbbbSAndreas Gohr case 'AND': 940b1bbbbbSAndreas Gohr $right = array_pop($stack); 950b1bbbbbSAndreas Gohr $left = array_pop($stack); 960b1bbbbbSAndreas Gohr $stack[] = $this->opAnd($left, $right); 970b1bbbbbSAndreas Gohr break; 980b1bbbbbSAndreas Gohr 990b1bbbbbSAndreas Gohr case 'OR': 1000b1bbbbbSAndreas Gohr $right = array_pop($stack); 1010b1bbbbbSAndreas Gohr $left = array_pop($stack); 1020b1bbbbbSAndreas Gohr $stack[] = $this->opOr($left, $right); 1030b1bbbbbSAndreas Gohr break; 1040b1bbbbbSAndreas Gohr 1050b1bbbbbSAndreas Gohr case 'NOT': 1060b1bbbbbSAndreas Gohr $operand = array_pop($stack); 1070b1bbbbbSAndreas Gohr $stack[] = new NegatedEntry($operand); 1080b1bbbbbSAndreas Gohr break; 1090b1bbbbbSAndreas Gohr } 1100b1bbbbbSAndreas Gohr } 1110b1bbbbbSAndreas Gohr 1120b1bbbbbSAndreas Gohr $result = array_pop($stack) ?? new PageSet(); 1130b1bbbbbSAndreas Gohr return $this->materialize($result)->getPages(); 1140b1bbbbbSAndreas Gohr } 1150b1bbbbbSAndreas Gohr 1160b1bbbbbSAndreas Gohr // region Operators 1170b1bbbbbSAndreas Gohr 1180b1bbbbbSAndreas Gohr /** 1190b1bbbbbSAndreas Gohr * AND: combine two operands based on their types 1200b1bbbbbSAndreas Gohr * 1210b1bbbbbSAndreas Gohr * PageSet AND PageSet produces an intersection with summed scores. When one 1220b1bbbbbSAndreas Gohr * operand is a NegatedEntry, the operation becomes set subtraction. When one 1230b1bbbbbSAndreas Gohr * operand is a NamespacePredicate, the other is filtered by namespace prefix. 1240b1bbbbbSAndreas Gohr * 1250b1bbbbbSAndreas Gohr * @param StackEntry $left 1260b1bbbbbSAndreas Gohr * @param StackEntry $right 1270b1bbbbbSAndreas Gohr * @return StackEntry 1280b1bbbbbSAndreas Gohr */ 1290b1bbbbbSAndreas Gohr protected function opAnd(StackEntry $left, StackEntry $right): StackEntry 1300b1bbbbbSAndreas Gohr { 1310b1bbbbbSAndreas Gohr // page set AND negated → subtract 1320b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof NegatedEntry) { 1330b1bbbbbSAndreas Gohr return $this->subtractNegated($left, $right); 1340b1bbbbbSAndreas Gohr } 1350b1bbbbbSAndreas Gohr if ($left instanceof NegatedEntry && $right instanceof PageSet) { 1360b1bbbbbSAndreas Gohr return $this->subtractNegated($right, $left); 1370b1bbbbbSAndreas Gohr } 1380b1bbbbbSAndreas Gohr 1390b1bbbbbSAndreas Gohr // page set AND namespace → filter by prefix 1400b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof NamespacePredicate) { 1410b1bbbbbSAndreas Gohr return $right->filter($left); 1420b1bbbbbSAndreas Gohr } 1430b1bbbbbSAndreas Gohr if ($left instanceof NamespacePredicate && $right instanceof PageSet) { 1440b1bbbbbSAndreas Gohr return $left->filter($right); 1450b1bbbbbSAndreas Gohr } 1460b1bbbbbSAndreas Gohr 1470b1bbbbbSAndreas Gohr // page set AND page set → intersect, sum scores 1480b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof PageSet) { 1490b1bbbbbSAndreas Gohr return $left->intersect($right); 1500b1bbbbbSAndreas Gohr } 1510b1bbbbbSAndreas Gohr 1520b1bbbbbSAndreas Gohr // rare cases (negated AND negated, namespace AND namespace, etc.) 1530b1bbbbbSAndreas Gohr return $this->materialize($left)->intersect($this->materialize($right)); 1540b1bbbbbSAndreas Gohr } 1550b1bbbbbSAndreas Gohr 1560b1bbbbbSAndreas Gohr /** 1570b1bbbbbSAndreas Gohr * OR: unite two operands 1580b1bbbbbSAndreas Gohr * 1590b1bbbbbSAndreas Gohr * PageSet OR PageSet produces a union with summed scores. Other combinations 1600b1bbbbbSAndreas Gohr * require materializing operands into concrete page sets first. 1610b1bbbbbSAndreas Gohr * 1620b1bbbbbSAndreas Gohr * @param StackEntry $left 1630b1bbbbbSAndreas Gohr * @param StackEntry $right 1640b1bbbbbSAndreas Gohr * @return StackEntry 1650b1bbbbbSAndreas Gohr */ 1660b1bbbbbSAndreas Gohr protected function opOr(StackEntry $left, StackEntry $right): StackEntry 1670b1bbbbbSAndreas Gohr { 1680b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof PageSet) { 1690b1bbbbbSAndreas Gohr return $left->unite($right); 1700b1bbbbbSAndreas Gohr } 1710b1bbbbbSAndreas Gohr 1720b1bbbbbSAndreas Gohr return $this->materialize($left)->unite($this->materialize($right)); 1730b1bbbbbSAndreas Gohr } 1740b1bbbbbSAndreas Gohr 1750b1bbbbbSAndreas Gohr /** 1760b1bbbbbSAndreas Gohr * Subtract a NegatedEntry from a PageSet 1770b1bbbbbSAndreas Gohr * 1780b1bbbbbSAndreas Gohr * The inner entry of the NegatedEntry determines the operation: 1790b1bbbbbSAndreas Gohr * - NegatedEntry(PageSet): set subtraction 1800b1bbbbbSAndreas Gohr * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace 1810b1bbbbbSAndreas Gohr * 1820b1bbbbbSAndreas Gohr * @param PageSet $pages the positive page set 1830b1bbbbbSAndreas Gohr * @param NegatedEntry $negated the negated operand 1840b1bbbbbSAndreas Gohr * @return PageSet 1850b1bbbbbSAndreas Gohr */ 1860b1bbbbbSAndreas Gohr protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet 1870b1bbbbbSAndreas Gohr { 1880b1bbbbbSAndreas Gohr $inner = $negated->getInner(); 1890b1bbbbbSAndreas Gohr 1900b1bbbbbSAndreas Gohr if ($inner instanceof NamespacePredicate) { 1910b1bbbbbSAndreas Gohr return $inner->exclude($pages); 1920b1bbbbbSAndreas Gohr } 1930b1bbbbbSAndreas Gohr 1940b1bbbbbSAndreas Gohr return $pages->subtract($this->materialize($inner)); 1950b1bbbbbSAndreas Gohr } 1960b1bbbbbSAndreas Gohr 1970b1bbbbbSAndreas Gohr // endregion 1980b1bbbbbSAndreas Gohr 1990b1bbbbbSAndreas Gohr // region Phrase matching 2000b1bbbbbSAndreas Gohr 2010b1bbbbbSAndreas Gohr /** 2020b1bbbbbSAndreas Gohr * Check which pages from the candidate set contain the given phrase 2030b1bbbbbSAndreas Gohr * 2040b1bbbbbSAndreas Gohr * Verifies phrase presence by reading each page's raw wiki text. 2050b1bbbbbSAndreas Gohr * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event. 2060b1bbbbbSAndreas Gohr * Pages that match retain their original scores from the candidate set. 2070b1bbbbbSAndreas Gohr * 2080b1bbbbbSAndreas Gohr * @param string $phrase the phrase to search for 2090b1bbbbbSAndreas Gohr * @param PageSet $candidates pages to check (typically the AND'd word results) 2100b1bbbbbSAndreas Gohr * @return PageSet pages where the phrase was found, with original scores preserved 2110b1bbbbbSAndreas Gohr */ 2120b1bbbbbSAndreas Gohr protected function matchPhrase(string $phrase, PageSet $candidates): PageSet 2130b1bbbbbSAndreas Gohr { 2140b1bbbbbSAndreas Gohr $matched = []; 2150b1bbbbbSAndreas Gohr foreach ($candidates->getPages() as $id => $score) { 2160b1bbbbbSAndreas Gohr $evdata = [ 2170b1bbbbbSAndreas Gohr 'id' => $id, 2180b1bbbbbSAndreas Gohr 'phrase' => $phrase, 2190b1bbbbbSAndreas Gohr 'text' => rawWiki($id), 2200b1bbbbbSAndreas Gohr ]; 2210b1bbbbbSAndreas Gohr $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 2220b1bbbbbSAndreas Gohr if ($event->advise_before() && $event->result !== true) { 223*9369b4a9SAndreas Gohr $text = PhpString::strtolower($evdata['text']); 2240b1bbbbbSAndreas Gohr if (str_contains($text, $phrase)) { 2250b1bbbbbSAndreas Gohr $event->result = true; 2260b1bbbbbSAndreas Gohr } 2270b1bbbbbSAndreas Gohr } 2280b1bbbbbSAndreas Gohr $event->advise_after(); 2290b1bbbbbSAndreas Gohr if ($event->result === true) { 2300b1bbbbbSAndreas Gohr $matched[$id] = $score; 2310b1bbbbbSAndreas Gohr } 2320b1bbbbbSAndreas Gohr } 2330b1bbbbbSAndreas Gohr return new PageSet($matched); 2340b1bbbbbSAndreas Gohr } 2350b1bbbbbSAndreas Gohr 2360b1bbbbbSAndreas Gohr // endregion 2370b1bbbbbSAndreas Gohr 2380b1bbbbbSAndreas Gohr // region Materialization 2390b1bbbbbSAndreas Gohr 2400b1bbbbbSAndreas Gohr /** 2410b1bbbbbSAndreas Gohr * Convert any StackEntry into a concrete PageSet 2420b1bbbbbSAndreas Gohr * 2430b1bbbbbSAndreas Gohr * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry 2440b1bbbbbSAndreas Gohr * cannot be resolved without knowing all existing pages — a namespace 2450b1bbbbbSAndreas Gohr * predicate needs the full page list to find matching pages, and a negated 2460b1bbbbbSAndreas Gohr * entry needs it to compute the complement. This is why materialization 2470b1bbbbbSAndreas Gohr * triggers a lazy-load of the full page index from disk. 2480b1bbbbbSAndreas Gohr * 2490b1bbbbbSAndreas Gohr * This is only needed for standalone namespace or negative-only queries 2500b1bbbbbSAndreas Gohr * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed 2510b1bbbbbSAndreas Gohr * operators handle these entries without the universe. 2520b1bbbbbSAndreas Gohr * 2530b1bbbbbSAndreas Gohr * @param StackEntry $entry 2540b1bbbbbSAndreas Gohr * @return PageSet 2550b1bbbbbSAndreas Gohr */ 2560b1bbbbbSAndreas Gohr protected function materialize(StackEntry $entry): PageSet 2570b1bbbbbSAndreas Gohr { 2580b1bbbbbSAndreas Gohr if ($entry instanceof PageSet) { 2590b1bbbbbSAndreas Gohr return $entry; 2600b1bbbbbSAndreas Gohr } 2610b1bbbbbSAndreas Gohr 2620b1bbbbbSAndreas Gohr if ($entry instanceof NegatedEntry) { 2630b1bbbbbSAndreas Gohr return $this->getAllPages()->subtract($this->materialize($entry->getInner())); 2640b1bbbbbSAndreas Gohr } 2650b1bbbbbSAndreas Gohr 2660b1bbbbbSAndreas Gohr if ($entry instanceof NamespacePredicate) { 2670b1bbbbbSAndreas Gohr return $entry->filter($this->getAllPages()); 2680b1bbbbbSAndreas Gohr } 2690b1bbbbbSAndreas Gohr 2700b1bbbbbSAndreas Gohr return new PageSet(); 2710b1bbbbbSAndreas Gohr } 2720b1bbbbbSAndreas Gohr 2730b1bbbbbSAndreas Gohr /** 2740b1bbbbbSAndreas Gohr * Lazy-load the universe of all indexed pages 2750b1bbbbbSAndreas Gohr * 2760b1bbbbbSAndreas Gohr * @return PageSet all pages with score 0 2770b1bbbbbSAndreas Gohr */ 2780b1bbbbbSAndreas Gohr protected function getAllPages(): PageSet 2790b1bbbbbSAndreas Gohr { 280*9369b4a9SAndreas Gohr if (!$this->allPages instanceof PageSet) { 2810b1bbbbbSAndreas Gohr $pages = (new Indexer())->getAllPages(); 2820b1bbbbbSAndreas Gohr $this->allPages = new PageSet(array_fill_keys($pages, 0)); 2830b1bbbbbSAndreas Gohr } 2840b1bbbbbSAndreas Gohr return $this->allPages; 2850b1bbbbbSAndreas Gohr } 2860b1bbbbbSAndreas Gohr 2870b1bbbbbSAndreas Gohr // endregion 2880b1bbbbbSAndreas Gohr} 289