1<?php 2 3namespace dokuwiki\Search\Query; 4 5use dokuwiki\Extension\Event; 6use dokuwiki\Search\Collection\Term; 7use dokuwiki\Search\Indexer; 8use dokuwiki\Utf8; 9 10/** 11 * Evaluates a parsed search query against word lookup results 12 * 13 * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry) 14 * to avoid materializing the full page index unless absolutely necessary 15 * (standalone NOT or namespace-only queries). 16 */ 17class QueryEvaluator 18{ 19 /** @var string[] RPN token array from QueryParser */ 20 protected array $rpn; 21 22 /** @var Term[] word => Term mapping from FrequencyCollectionSearch */ 23 protected array $terms; 24 25 /** @var PageSet|null lazy-loaded universe of all indexed pages */ 26 protected ?PageSet $allPages = null; 27 28 /** 29 * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary'] 30 * @param Term[] $terms word => Term mapping from FrequencyCollectionSearch::execute() 31 */ 32 public function __construct(array $rpn, array $terms) 33 { 34 $this->rpn = $rpn; 35 $this->terms = $terms; 36 } 37 38 /** 39 * Evaluate the RPN query and return matching pages with scores 40 * 41 * The query is represented in Reverse Polish Notation (RPN), also known as postfix 42 * notation. In RPN, operands come first and operators follow. For example, the infix 43 * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for 44 * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND". 45 * 46 * Evaluation uses a stack. Each token is processed left to right: operand tokens 47 * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND, 48 * OR, NOT) pop their operands from the stack, compute a result, and push it back. 49 * After all tokens are processed, the single remaining stack entry is the final result. 50 * 51 * The stack entries are typed: word lookups produce PageSet entries (concrete page 52 * results with scores), namespace tokens produce NamespacePredicate entries (a filter 53 * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators 54 * inspect the types of their operands to choose the most efficient operation — for 55 * example, AND with a NegatedEntry performs set subtraction rather than requiring 56 * the full page universe. 57 * 58 * @return array<string, int> page ID => score 59 */ 60 public function evaluate(): array 61 { 62 /** @var StackEntry[] $stack */ 63 $stack = []; 64 65 foreach ($this->rpn as $token) { 66 switch (substr($token, 0, 3)) { 67 case 'W+:': 68 case 'W-:': 69 case 'W_:': 70 $word = substr($token, 3); 71 if (isset($this->terms[$word])) { 72 $stack[] = new PageSet((array)$this->terms[$word]->getEntityFrequencies()); 73 } 74 break; 75 76 case 'P+:': 77 case 'P-:': 78 $phrase = substr($token, 3); 79 // Phrases are always preceded by their component words AND'd together, 80 // so the top of stack contains pages matching all words in the phrase. 81 // We verify the actual phrase exists in those candidate pages. 82 $candidates = end($stack) ?: new PageSet(); 83 $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates)); 84 break; 85 86 case 'N+:': 87 case 'N-:': 88 $ns = cleanID(substr($token, 3)) . ':'; 89 $stack[] = new NamespacePredicate($ns); 90 break; 91 92 case 'AND': 93 $right = array_pop($stack); 94 $left = array_pop($stack); 95 $stack[] = $this->opAnd($left, $right); 96 break; 97 98 case 'OR': 99 $right = array_pop($stack); 100 $left = array_pop($stack); 101 $stack[] = $this->opOr($left, $right); 102 break; 103 104 case 'NOT': 105 $operand = array_pop($stack); 106 $stack[] = new NegatedEntry($operand); 107 break; 108 } 109 } 110 111 $result = array_pop($stack) ?? new PageSet(); 112 return $this->materialize($result)->getPages(); 113 } 114 115 // region Operators 116 117 /** 118 * AND: combine two operands based on their types 119 * 120 * PageSet AND PageSet produces an intersection with summed scores. When one 121 * operand is a NegatedEntry, the operation becomes set subtraction. When one 122 * operand is a NamespacePredicate, the other is filtered by namespace prefix. 123 * 124 * @param StackEntry $left 125 * @param StackEntry $right 126 * @return StackEntry 127 */ 128 protected function opAnd(StackEntry $left, StackEntry $right): StackEntry 129 { 130 // page set AND negated → subtract 131 if ($left instanceof PageSet && $right instanceof NegatedEntry) { 132 return $this->subtractNegated($left, $right); 133 } 134 if ($left instanceof NegatedEntry && $right instanceof PageSet) { 135 return $this->subtractNegated($right, $left); 136 } 137 138 // page set AND namespace → filter by prefix 139 if ($left instanceof PageSet && $right instanceof NamespacePredicate) { 140 return $right->filter($left); 141 } 142 if ($left instanceof NamespacePredicate && $right instanceof PageSet) { 143 return $left->filter($right); 144 } 145 146 // page set AND page set → intersect, sum scores 147 if ($left instanceof PageSet && $right instanceof PageSet) { 148 return $left->intersect($right); 149 } 150 151 // rare cases (negated AND negated, namespace AND namespace, etc.) 152 return $this->materialize($left)->intersect($this->materialize($right)); 153 } 154 155 /** 156 * OR: unite two operands 157 * 158 * PageSet OR PageSet produces a union with summed scores. Other combinations 159 * require materializing operands into concrete page sets first. 160 * 161 * @param StackEntry $left 162 * @param StackEntry $right 163 * @return StackEntry 164 */ 165 protected function opOr(StackEntry $left, StackEntry $right): StackEntry 166 { 167 if ($left instanceof PageSet && $right instanceof PageSet) { 168 return $left->unite($right); 169 } 170 171 return $this->materialize($left)->unite($this->materialize($right)); 172 } 173 174 /** 175 * Subtract a NegatedEntry from a PageSet 176 * 177 * The inner entry of the NegatedEntry determines the operation: 178 * - NegatedEntry(PageSet): set subtraction 179 * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace 180 * 181 * @param PageSet $pages the positive page set 182 * @param NegatedEntry $negated the negated operand 183 * @return PageSet 184 */ 185 protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet 186 { 187 $inner = $negated->getInner(); 188 189 if ($inner instanceof NamespacePredicate) { 190 return $inner->exclude($pages); 191 } 192 193 return $pages->subtract($this->materialize($inner)); 194 } 195 196 // endregion 197 198 // region Phrase matching 199 200 /** 201 * Check which pages from the candidate set contain the given phrase 202 * 203 * Verifies phrase presence by reading each page's raw wiki text. 204 * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event. 205 * Pages that match retain their original scores from the candidate set. 206 * 207 * @param string $phrase the phrase to search for 208 * @param PageSet $candidates pages to check (typically the AND'd word results) 209 * @return PageSet pages where the phrase was found, with original scores preserved 210 */ 211 protected function matchPhrase(string $phrase, PageSet $candidates): PageSet 212 { 213 $matched = []; 214 foreach ($candidates->getPages() as $id => $score) { 215 $evdata = [ 216 'id' => $id, 217 'phrase' => $phrase, 218 'text' => rawWiki($id), 219 ]; 220 $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 221 if ($event->advise_before() && $event->result !== true) { 222 $text = Utf8\PhpString::strtolower($evdata['text']); 223 if (str_contains($text, $phrase)) { 224 $event->result = true; 225 } 226 } 227 $event->advise_after(); 228 if ($event->result === true) { 229 $matched[$id] = $score; 230 } 231 } 232 return new PageSet($matched); 233 } 234 235 // endregion 236 237 // region Materialization 238 239 /** 240 * Convert any StackEntry into a concrete PageSet 241 * 242 * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry 243 * cannot be resolved without knowing all existing pages — a namespace 244 * predicate needs the full page list to find matching pages, and a negated 245 * entry needs it to compute the complement. This is why materialization 246 * triggers a lazy-load of the full page index from disk. 247 * 248 * This is only needed for standalone namespace or negative-only queries 249 * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed 250 * operators handle these entries without the universe. 251 * 252 * @param StackEntry $entry 253 * @return PageSet 254 */ 255 protected function materialize(StackEntry $entry): PageSet 256 { 257 if ($entry instanceof PageSet) { 258 return $entry; 259 } 260 261 if ($entry instanceof NegatedEntry) { 262 return $this->getAllPages()->subtract($this->materialize($entry->getInner())); 263 } 264 265 if ($entry instanceof NamespacePredicate) { 266 return $entry->filter($this->getAllPages()); 267 } 268 269 return new PageSet(); 270 } 271 272 /** 273 * Lazy-load the universe of all indexed pages 274 * 275 * @return PageSet all pages with score 0 276 */ 277 protected function getAllPages(): PageSet 278 { 279 if ($this->allPages === null) { 280 $pages = (new Indexer())->getAllPages(); 281 $this->allPages = new PageSet(array_fill_keys($pages, 0)); 282 } 283 return $this->allPages; 284 } 285 286 // endregion 287} 288