1*0b1bbbbbSAndreas Gohr<?php 2*0b1bbbbbSAndreas Gohr 3*0b1bbbbbSAndreas Gohrnamespace dokuwiki\Search\Query; 4*0b1bbbbbSAndreas Gohr 5*0b1bbbbbSAndreas Gohruse dokuwiki\Extension\Event; 6*0b1bbbbbSAndreas Gohruse dokuwiki\Search\Collection\Term; 7*0b1bbbbbSAndreas Gohruse dokuwiki\Search\Indexer; 8*0b1bbbbbSAndreas Gohruse dokuwiki\Utf8; 9*0b1bbbbbSAndreas Gohr 10*0b1bbbbbSAndreas Gohr/** 11*0b1bbbbbSAndreas Gohr * Evaluates a parsed search query against word lookup results 12*0b1bbbbbSAndreas Gohr * 13*0b1bbbbbSAndreas Gohr * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry) 14*0b1bbbbbSAndreas Gohr * to avoid materializing the full page index unless absolutely necessary 15*0b1bbbbbSAndreas Gohr * (standalone NOT or namespace-only queries). 16*0b1bbbbbSAndreas Gohr */ 17*0b1bbbbbSAndreas Gohrclass QueryEvaluator 18*0b1bbbbbSAndreas Gohr{ 19*0b1bbbbbSAndreas Gohr /** @var string[] RPN token array from QueryParser */ 20*0b1bbbbbSAndreas Gohr protected array $rpn; 21*0b1bbbbbSAndreas Gohr 22*0b1bbbbbSAndreas Gohr /** @var Term[] word => Term mapping from FrequencyCollectionSearch */ 23*0b1bbbbbSAndreas Gohr protected array $terms; 24*0b1bbbbbSAndreas Gohr 25*0b1bbbbbSAndreas Gohr /** @var PageSet|null lazy-loaded universe of all indexed pages */ 26*0b1bbbbbSAndreas Gohr protected ?PageSet $allPages = null; 27*0b1bbbbbSAndreas Gohr 28*0b1bbbbbSAndreas Gohr /** 29*0b1bbbbbSAndreas Gohr * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary'] 30*0b1bbbbbSAndreas Gohr * @param Term[] $terms word => Term mapping from FrequencyCollectionSearch::execute() 31*0b1bbbbbSAndreas Gohr */ 32*0b1bbbbbSAndreas Gohr public function __construct(array $rpn, array $terms) 33*0b1bbbbbSAndreas Gohr { 34*0b1bbbbbSAndreas Gohr $this->rpn = $rpn; 35*0b1bbbbbSAndreas Gohr $this->terms = $terms; 36*0b1bbbbbSAndreas Gohr } 37*0b1bbbbbSAndreas Gohr 38*0b1bbbbbSAndreas Gohr /** 39*0b1bbbbbSAndreas Gohr * Evaluate the RPN query and return matching pages with scores 40*0b1bbbbbSAndreas Gohr * 41*0b1bbbbbSAndreas Gohr * The query is represented in Reverse Polish Notation (RPN), also known as postfix 42*0b1bbbbbSAndreas Gohr * notation. In RPN, operands come first and operators follow. For example, the infix 43*0b1bbbbbSAndreas Gohr * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for 44*0b1bbbbbSAndreas Gohr * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND". 45*0b1bbbbbSAndreas Gohr * 46*0b1bbbbbSAndreas Gohr * Evaluation uses a stack. Each token is processed left to right: operand tokens 47*0b1bbbbbSAndreas Gohr * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND, 48*0b1bbbbbSAndreas Gohr * OR, NOT) pop their operands from the stack, compute a result, and push it back. 49*0b1bbbbbSAndreas Gohr * After all tokens are processed, the single remaining stack entry is the final result. 50*0b1bbbbbSAndreas Gohr * 51*0b1bbbbbSAndreas Gohr * The stack entries are typed: word lookups produce PageSet entries (concrete page 52*0b1bbbbbSAndreas Gohr * results with scores), namespace tokens produce NamespacePredicate entries (a filter 53*0b1bbbbbSAndreas Gohr * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators 54*0b1bbbbbSAndreas Gohr * inspect the types of their operands to choose the most efficient operation — for 55*0b1bbbbbSAndreas Gohr * example, AND with a NegatedEntry performs set subtraction rather than requiring 56*0b1bbbbbSAndreas Gohr * the full page universe. 57*0b1bbbbbSAndreas Gohr * 58*0b1bbbbbSAndreas Gohr * @return array<string, int> page ID => score 59*0b1bbbbbSAndreas Gohr */ 60*0b1bbbbbSAndreas Gohr public function evaluate(): array 61*0b1bbbbbSAndreas Gohr { 62*0b1bbbbbSAndreas Gohr /** @var StackEntry[] $stack */ 63*0b1bbbbbSAndreas Gohr $stack = []; 64*0b1bbbbbSAndreas Gohr 65*0b1bbbbbSAndreas Gohr foreach ($this->rpn as $token) { 66*0b1bbbbbSAndreas Gohr switch (substr($token, 0, 3)) { 67*0b1bbbbbSAndreas Gohr case 'W+:': 68*0b1bbbbbSAndreas Gohr case 'W-:': 69*0b1bbbbbSAndreas Gohr case 'W_:': 70*0b1bbbbbSAndreas Gohr $word = substr($token, 3); 71*0b1bbbbbSAndreas Gohr if (isset($this->terms[$word])) { 72*0b1bbbbbSAndreas Gohr $stack[] = new PageSet((array)$this->terms[$word]->getEntityFrequencies()); 73*0b1bbbbbSAndreas Gohr } 74*0b1bbbbbSAndreas Gohr break; 75*0b1bbbbbSAndreas Gohr 76*0b1bbbbbSAndreas Gohr case 'P+:': 77*0b1bbbbbSAndreas Gohr case 'P-:': 78*0b1bbbbbSAndreas Gohr $phrase = substr($token, 3); 79*0b1bbbbbSAndreas Gohr // Phrases are always preceded by their component words AND'd together, 80*0b1bbbbbSAndreas Gohr // so the top of stack contains pages matching all words in the phrase. 81*0b1bbbbbSAndreas Gohr // We verify the actual phrase exists in those candidate pages. 82*0b1bbbbbSAndreas Gohr $candidates = end($stack) ?: new PageSet(); 83*0b1bbbbbSAndreas Gohr $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates)); 84*0b1bbbbbSAndreas Gohr break; 85*0b1bbbbbSAndreas Gohr 86*0b1bbbbbSAndreas Gohr case 'N+:': 87*0b1bbbbbSAndreas Gohr case 'N-:': 88*0b1bbbbbSAndreas Gohr $ns = cleanID(substr($token, 3)) . ':'; 89*0b1bbbbbSAndreas Gohr $stack[] = new NamespacePredicate($ns); 90*0b1bbbbbSAndreas Gohr break; 91*0b1bbbbbSAndreas Gohr 92*0b1bbbbbSAndreas Gohr case 'AND': 93*0b1bbbbbSAndreas Gohr $right = array_pop($stack); 94*0b1bbbbbSAndreas Gohr $left = array_pop($stack); 95*0b1bbbbbSAndreas Gohr $stack[] = $this->opAnd($left, $right); 96*0b1bbbbbSAndreas Gohr break; 97*0b1bbbbbSAndreas Gohr 98*0b1bbbbbSAndreas Gohr case 'OR': 99*0b1bbbbbSAndreas Gohr $right = array_pop($stack); 100*0b1bbbbbSAndreas Gohr $left = array_pop($stack); 101*0b1bbbbbSAndreas Gohr $stack[] = $this->opOr($left, $right); 102*0b1bbbbbSAndreas Gohr break; 103*0b1bbbbbSAndreas Gohr 104*0b1bbbbbSAndreas Gohr case 'NOT': 105*0b1bbbbbSAndreas Gohr $operand = array_pop($stack); 106*0b1bbbbbSAndreas Gohr $stack[] = new NegatedEntry($operand); 107*0b1bbbbbSAndreas Gohr break; 108*0b1bbbbbSAndreas Gohr } 109*0b1bbbbbSAndreas Gohr } 110*0b1bbbbbSAndreas Gohr 111*0b1bbbbbSAndreas Gohr $result = array_pop($stack) ?? new PageSet(); 112*0b1bbbbbSAndreas Gohr return $this->materialize($result)->getPages(); 113*0b1bbbbbSAndreas Gohr } 114*0b1bbbbbSAndreas Gohr 115*0b1bbbbbSAndreas Gohr // region Operators 116*0b1bbbbbSAndreas Gohr 117*0b1bbbbbSAndreas Gohr /** 118*0b1bbbbbSAndreas Gohr * AND: combine two operands based on their types 119*0b1bbbbbSAndreas Gohr * 120*0b1bbbbbSAndreas Gohr * PageSet AND PageSet produces an intersection with summed scores. When one 121*0b1bbbbbSAndreas Gohr * operand is a NegatedEntry, the operation becomes set subtraction. When one 122*0b1bbbbbSAndreas Gohr * operand is a NamespacePredicate, the other is filtered by namespace prefix. 123*0b1bbbbbSAndreas Gohr * 124*0b1bbbbbSAndreas Gohr * @param StackEntry $left 125*0b1bbbbbSAndreas Gohr * @param StackEntry $right 126*0b1bbbbbSAndreas Gohr * @return StackEntry 127*0b1bbbbbSAndreas Gohr */ 128*0b1bbbbbSAndreas Gohr protected function opAnd(StackEntry $left, StackEntry $right): StackEntry 129*0b1bbbbbSAndreas Gohr { 130*0b1bbbbbSAndreas Gohr // page set AND negated → subtract 131*0b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof NegatedEntry) { 132*0b1bbbbbSAndreas Gohr return $this->subtractNegated($left, $right); 133*0b1bbbbbSAndreas Gohr } 134*0b1bbbbbSAndreas Gohr if ($left instanceof NegatedEntry && $right instanceof PageSet) { 135*0b1bbbbbSAndreas Gohr return $this->subtractNegated($right, $left); 136*0b1bbbbbSAndreas Gohr } 137*0b1bbbbbSAndreas Gohr 138*0b1bbbbbSAndreas Gohr // page set AND namespace → filter by prefix 139*0b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof NamespacePredicate) { 140*0b1bbbbbSAndreas Gohr return $right->filter($left); 141*0b1bbbbbSAndreas Gohr } 142*0b1bbbbbSAndreas Gohr if ($left instanceof NamespacePredicate && $right instanceof PageSet) { 143*0b1bbbbbSAndreas Gohr return $left->filter($right); 144*0b1bbbbbSAndreas Gohr } 145*0b1bbbbbSAndreas Gohr 146*0b1bbbbbSAndreas Gohr // page set AND page set → intersect, sum scores 147*0b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof PageSet) { 148*0b1bbbbbSAndreas Gohr return $left->intersect($right); 149*0b1bbbbbSAndreas Gohr } 150*0b1bbbbbSAndreas Gohr 151*0b1bbbbbSAndreas Gohr // rare cases (negated AND negated, namespace AND namespace, etc.) 152*0b1bbbbbSAndreas Gohr return $this->materialize($left)->intersect($this->materialize($right)); 153*0b1bbbbbSAndreas Gohr } 154*0b1bbbbbSAndreas Gohr 155*0b1bbbbbSAndreas Gohr /** 156*0b1bbbbbSAndreas Gohr * OR: unite two operands 157*0b1bbbbbSAndreas Gohr * 158*0b1bbbbbSAndreas Gohr * PageSet OR PageSet produces a union with summed scores. Other combinations 159*0b1bbbbbSAndreas Gohr * require materializing operands into concrete page sets first. 160*0b1bbbbbSAndreas Gohr * 161*0b1bbbbbSAndreas Gohr * @param StackEntry $left 162*0b1bbbbbSAndreas Gohr * @param StackEntry $right 163*0b1bbbbbSAndreas Gohr * @return StackEntry 164*0b1bbbbbSAndreas Gohr */ 165*0b1bbbbbSAndreas Gohr protected function opOr(StackEntry $left, StackEntry $right): StackEntry 166*0b1bbbbbSAndreas Gohr { 167*0b1bbbbbSAndreas Gohr if ($left instanceof PageSet && $right instanceof PageSet) { 168*0b1bbbbbSAndreas Gohr return $left->unite($right); 169*0b1bbbbbSAndreas Gohr } 170*0b1bbbbbSAndreas Gohr 171*0b1bbbbbSAndreas Gohr return $this->materialize($left)->unite($this->materialize($right)); 172*0b1bbbbbSAndreas Gohr } 173*0b1bbbbbSAndreas Gohr 174*0b1bbbbbSAndreas Gohr /** 175*0b1bbbbbSAndreas Gohr * Subtract a NegatedEntry from a PageSet 176*0b1bbbbbSAndreas Gohr * 177*0b1bbbbbSAndreas Gohr * The inner entry of the NegatedEntry determines the operation: 178*0b1bbbbbSAndreas Gohr * - NegatedEntry(PageSet): set subtraction 179*0b1bbbbbSAndreas Gohr * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace 180*0b1bbbbbSAndreas Gohr * 181*0b1bbbbbSAndreas Gohr * @param PageSet $pages the positive page set 182*0b1bbbbbSAndreas Gohr * @param NegatedEntry $negated the negated operand 183*0b1bbbbbSAndreas Gohr * @return PageSet 184*0b1bbbbbSAndreas Gohr */ 185*0b1bbbbbSAndreas Gohr protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet 186*0b1bbbbbSAndreas Gohr { 187*0b1bbbbbSAndreas Gohr $inner = $negated->getInner(); 188*0b1bbbbbSAndreas Gohr 189*0b1bbbbbSAndreas Gohr if ($inner instanceof NamespacePredicate) { 190*0b1bbbbbSAndreas Gohr return $inner->exclude($pages); 191*0b1bbbbbSAndreas Gohr } 192*0b1bbbbbSAndreas Gohr 193*0b1bbbbbSAndreas Gohr return $pages->subtract($this->materialize($inner)); 194*0b1bbbbbSAndreas Gohr } 195*0b1bbbbbSAndreas Gohr 196*0b1bbbbbSAndreas Gohr // endregion 197*0b1bbbbbSAndreas Gohr 198*0b1bbbbbSAndreas Gohr // region Phrase matching 199*0b1bbbbbSAndreas Gohr 200*0b1bbbbbSAndreas Gohr /** 201*0b1bbbbbSAndreas Gohr * Check which pages from the candidate set contain the given phrase 202*0b1bbbbbSAndreas Gohr * 203*0b1bbbbbSAndreas Gohr * Verifies phrase presence by reading each page's raw wiki text. 204*0b1bbbbbSAndreas Gohr * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event. 205*0b1bbbbbSAndreas Gohr * Pages that match retain their original scores from the candidate set. 206*0b1bbbbbSAndreas Gohr * 207*0b1bbbbbSAndreas Gohr * @param string $phrase the phrase to search for 208*0b1bbbbbSAndreas Gohr * @param PageSet $candidates pages to check (typically the AND'd word results) 209*0b1bbbbbSAndreas Gohr * @return PageSet pages where the phrase was found, with original scores preserved 210*0b1bbbbbSAndreas Gohr */ 211*0b1bbbbbSAndreas Gohr protected function matchPhrase(string $phrase, PageSet $candidates): PageSet 212*0b1bbbbbSAndreas Gohr { 213*0b1bbbbbSAndreas Gohr $matched = []; 214*0b1bbbbbSAndreas Gohr foreach ($candidates->getPages() as $id => $score) { 215*0b1bbbbbSAndreas Gohr $evdata = [ 216*0b1bbbbbSAndreas Gohr 'id' => $id, 217*0b1bbbbbSAndreas Gohr 'phrase' => $phrase, 218*0b1bbbbbSAndreas Gohr 'text' => rawWiki($id), 219*0b1bbbbbSAndreas Gohr ]; 220*0b1bbbbbSAndreas Gohr $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 221*0b1bbbbbSAndreas Gohr if ($event->advise_before() && $event->result !== true) { 222*0b1bbbbbSAndreas Gohr $text = Utf8\PhpString::strtolower($evdata['text']); 223*0b1bbbbbSAndreas Gohr if (str_contains($text, $phrase)) { 224*0b1bbbbbSAndreas Gohr $event->result = true; 225*0b1bbbbbSAndreas Gohr } 226*0b1bbbbbSAndreas Gohr } 227*0b1bbbbbSAndreas Gohr $event->advise_after(); 228*0b1bbbbbSAndreas Gohr if ($event->result === true) { 229*0b1bbbbbSAndreas Gohr $matched[$id] = $score; 230*0b1bbbbbSAndreas Gohr } 231*0b1bbbbbSAndreas Gohr } 232*0b1bbbbbSAndreas Gohr return new PageSet($matched); 233*0b1bbbbbSAndreas Gohr } 234*0b1bbbbbSAndreas Gohr 235*0b1bbbbbSAndreas Gohr // endregion 236*0b1bbbbbSAndreas Gohr 237*0b1bbbbbSAndreas Gohr // region Materialization 238*0b1bbbbbSAndreas Gohr 239*0b1bbbbbSAndreas Gohr /** 240*0b1bbbbbSAndreas Gohr * Convert any StackEntry into a concrete PageSet 241*0b1bbbbbSAndreas Gohr * 242*0b1bbbbbSAndreas Gohr * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry 243*0b1bbbbbSAndreas Gohr * cannot be resolved without knowing all existing pages — a namespace 244*0b1bbbbbSAndreas Gohr * predicate needs the full page list to find matching pages, and a negated 245*0b1bbbbbSAndreas Gohr * entry needs it to compute the complement. This is why materialization 246*0b1bbbbbSAndreas Gohr * triggers a lazy-load of the full page index from disk. 247*0b1bbbbbSAndreas Gohr * 248*0b1bbbbbSAndreas Gohr * This is only needed for standalone namespace or negative-only queries 249*0b1bbbbbSAndreas Gohr * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed 250*0b1bbbbbSAndreas Gohr * operators handle these entries without the universe. 251*0b1bbbbbSAndreas Gohr * 252*0b1bbbbbSAndreas Gohr * @param StackEntry $entry 253*0b1bbbbbSAndreas Gohr * @return PageSet 254*0b1bbbbbSAndreas Gohr */ 255*0b1bbbbbSAndreas Gohr protected function materialize(StackEntry $entry): PageSet 256*0b1bbbbbSAndreas Gohr { 257*0b1bbbbbSAndreas Gohr if ($entry instanceof PageSet) { 258*0b1bbbbbSAndreas Gohr return $entry; 259*0b1bbbbbSAndreas Gohr } 260*0b1bbbbbSAndreas Gohr 261*0b1bbbbbSAndreas Gohr if ($entry instanceof NegatedEntry) { 262*0b1bbbbbSAndreas Gohr return $this->getAllPages()->subtract($this->materialize($entry->getInner())); 263*0b1bbbbbSAndreas Gohr } 264*0b1bbbbbSAndreas Gohr 265*0b1bbbbbSAndreas Gohr if ($entry instanceof NamespacePredicate) { 266*0b1bbbbbSAndreas Gohr return $entry->filter($this->getAllPages()); 267*0b1bbbbbSAndreas Gohr } 268*0b1bbbbbSAndreas Gohr 269*0b1bbbbbSAndreas Gohr return new PageSet(); 270*0b1bbbbbSAndreas Gohr } 271*0b1bbbbbSAndreas Gohr 272*0b1bbbbbSAndreas Gohr /** 273*0b1bbbbbSAndreas Gohr * Lazy-load the universe of all indexed pages 274*0b1bbbbbSAndreas Gohr * 275*0b1bbbbbSAndreas Gohr * @return PageSet all pages with score 0 276*0b1bbbbbSAndreas Gohr */ 277*0b1bbbbbSAndreas Gohr protected function getAllPages(): PageSet 278*0b1bbbbbSAndreas Gohr { 279*0b1bbbbbSAndreas Gohr if ($this->allPages === null) { 280*0b1bbbbbSAndreas Gohr $pages = (new Indexer())->getAllPages(); 281*0b1bbbbbSAndreas Gohr $this->allPages = new PageSet(array_fill_keys($pages, 0)); 282*0b1bbbbbSAndreas Gohr } 283*0b1bbbbbSAndreas Gohr return $this->allPages; 284*0b1bbbbbSAndreas Gohr } 285*0b1bbbbbSAndreas Gohr 286*0b1bbbbbSAndreas Gohr // endregion 287*0b1bbbbbSAndreas Gohr} 288