1<?php 2 3namespace dokuwiki\Search\Query; 4 5use dokuwiki\Utf8\PhpString; 6use dokuwiki\Extension\Event; 7use dokuwiki\Search\Collection\Term; 8use dokuwiki\Search\Indexer; 9use dokuwiki\Utf8; 10 11/** 12 * Evaluates a parsed search query against word lookup results 13 * 14 * Uses typed stack entries (PageSet, NamespacePredicate, NegatedEntry) 15 * to avoid materializing the full page index unless absolutely necessary 16 * (standalone NOT or namespace-only queries). 17 */ 18class QueryEvaluator 19{ 20 /** @var string[] RPN token array from QueryParser */ 21 protected array $rpn; 22 23 /** @var Term[] word => Term mapping from CollectionSearch */ 24 protected array $terms; 25 26 /** @var PageSet|null lazy-loaded universe of all indexed pages */ 27 protected ?PageSet $allPages = null; 28 29 /** 30 * @param string[] $rpn RPN token array from QueryParser::convert()['parsed_ary'] 31 * @param Term[] $terms word => Term mapping from CollectionSearch::execute() 32 */ 33 public function __construct(array $rpn, array $terms) 34 { 35 $this->rpn = $rpn; 36 $this->terms = $terms; 37 } 38 39 /** 40 * Evaluate the RPN query and return matching pages with scores 41 * 42 * The query is represented in Reverse Polish Notation (RPN), also known as postfix 43 * notation. In RPN, operands come first and operators follow. For example, the infix 44 * expression "A AND B" becomes "A B AND" in RPN. This eliminates the need for 45 * parentheses — the expression "(A OR B) AND C" is simply "A B OR C AND". 46 * 47 * Evaluation uses a stack. Each token is processed left to right: operand tokens 48 * (words, phrases, namespaces) push an entry onto the stack; operator tokens (AND, 49 * OR, NOT) pop their operands from the stack, compute a result, and push it back. 50 * After all tokens are processed, the single remaining stack entry is the final result. 51 * 52 * The stack entries are typed: word lookups produce PageSet entries (concrete page 53 * results with scores), namespace tokens produce NamespacePredicate entries (a filter 54 * to be applied later), and NOT wraps its operand in a NegatedEntry. Binary operators 55 * inspect the types of their operands to choose the most efficient operation — for 56 * example, AND with a NegatedEntry performs set subtraction rather than requiring 57 * the full page universe. 58 * 59 * @return array<string, int> page ID => score 60 */ 61 public function evaluate(): array 62 { 63 /** @var StackEntry[] $stack */ 64 $stack = []; 65 66 foreach ($this->rpn as $token) { 67 switch (substr($token, 0, 3)) { 68 case 'W+:': 69 case 'W-:': 70 case 'W_:': 71 $word = substr($token, 3); 72 if (isset($this->terms[$word])) { 73 $stack[] = new PageSet($this->terms[$word]->getEntityFrequencies()); 74 } 75 break; 76 77 case 'P+:': 78 case 'P-:': 79 $phrase = substr($token, 3); 80 // Phrases are always preceded by their component words AND'd together, 81 // so the top of stack contains pages matching all words in the phrase. 82 // We verify the actual phrase exists in those candidate pages. 83 $candidates = end($stack) ?: new PageSet(); 84 $stack[] = $this->matchPhrase($phrase, $this->materialize($candidates)); 85 break; 86 87 case 'N+:': 88 case 'N-:': 89 $ns = cleanID(substr($token, 3)) . ':'; 90 $stack[] = new NamespacePredicate($ns); 91 break; 92 93 case 'AND': 94 $right = array_pop($stack); 95 $left = array_pop($stack); 96 $stack[] = $this->opAnd($left, $right); 97 break; 98 99 case 'OR': 100 $right = array_pop($stack); 101 $left = array_pop($stack); 102 $stack[] = $this->opOr($left, $right); 103 break; 104 105 case 'NOT': 106 $operand = array_pop($stack); 107 $stack[] = new NegatedEntry($operand); 108 break; 109 } 110 } 111 112 $result = array_pop($stack) ?? new PageSet(); 113 return $this->materialize($result)->getPages(); 114 } 115 116 // region Operators 117 118 /** 119 * AND: combine two operands based on their types 120 * 121 * PageSet AND PageSet produces an intersection with summed scores. When one 122 * operand is a NegatedEntry, the operation becomes set subtraction. When one 123 * operand is a NamespacePredicate, the other is filtered by namespace prefix. 124 * 125 * @param StackEntry $left 126 * @param StackEntry $right 127 * @return StackEntry 128 */ 129 protected function opAnd(StackEntry $left, StackEntry $right): StackEntry 130 { 131 // page set AND negated → subtract 132 if ($left instanceof PageSet && $right instanceof NegatedEntry) { 133 return $this->subtractNegated($left, $right); 134 } 135 if ($left instanceof NegatedEntry && $right instanceof PageSet) { 136 return $this->subtractNegated($right, $left); 137 } 138 139 // page set AND namespace → filter by prefix 140 if ($left instanceof PageSet && $right instanceof NamespacePredicate) { 141 return $right->filter($left); 142 } 143 if ($left instanceof NamespacePredicate && $right instanceof PageSet) { 144 return $left->filter($right); 145 } 146 147 // page set AND page set → intersect, sum scores 148 if ($left instanceof PageSet && $right instanceof PageSet) { 149 return $left->intersect($right); 150 } 151 152 // rare cases (negated AND negated, namespace AND namespace, etc.) 153 return $this->materialize($left)->intersect($this->materialize($right)); 154 } 155 156 /** 157 * OR: unite two operands 158 * 159 * PageSet OR PageSet produces a union with summed scores. Other combinations 160 * require materializing operands into concrete page sets first. 161 * 162 * @param StackEntry $left 163 * @param StackEntry $right 164 * @return StackEntry 165 */ 166 protected function opOr(StackEntry $left, StackEntry $right): StackEntry 167 { 168 if ($left instanceof PageSet && $right instanceof PageSet) { 169 return $left->unite($right); 170 } 171 172 return $this->materialize($left)->unite($this->materialize($right)); 173 } 174 175 /** 176 * Subtract a NegatedEntry from a PageSet 177 * 178 * The inner entry of the NegatedEntry determines the operation: 179 * - NegatedEntry(PageSet): set subtraction 180 * - NegatedEntry(NamespacePredicate): exclude pages matching the namespace 181 * 182 * @param PageSet $pages the positive page set 183 * @param NegatedEntry $negated the negated operand 184 * @return PageSet 185 */ 186 protected function subtractNegated(PageSet $pages, NegatedEntry $negated): PageSet 187 { 188 $inner = $negated->getInner(); 189 190 if ($inner instanceof NamespacePredicate) { 191 return $inner->exclude($pages); 192 } 193 194 return $pages->subtract($this->materialize($inner)); 195 } 196 197 // endregion 198 199 // region Phrase matching 200 201 /** 202 * Check which pages from the candidate set contain the given phrase 203 * 204 * Verifies phrase presence by reading each page's raw wiki text. 205 * Plugins can override phrase matching via the FULLTEXT_PHRASE_MATCH event. 206 * Pages that match retain their original scores from the candidate set. 207 * 208 * @param string $phrase the phrase to search for 209 * @param PageSet $candidates pages to check (typically the AND'd word results) 210 * @return PageSet pages where the phrase was found, with original scores preserved 211 */ 212 protected function matchPhrase(string $phrase, PageSet $candidates): PageSet 213 { 214 $matched = []; 215 foreach ($candidates->getPages() as $id => $score) { 216 $evdata = [ 217 'id' => $id, 218 'phrase' => $phrase, 219 'text' => rawWiki($id), 220 ]; 221 $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 222 if ($event->advise_before() && $event->result !== true) { 223 $text = PhpString::strtolower($evdata['text']); 224 if (str_contains($text, $phrase)) { 225 $event->result = true; 226 } 227 } 228 $event->advise_after(); 229 if ($event->result === true) { 230 $matched[$id] = $score; 231 } 232 } 233 return new PageSet($matched); 234 } 235 236 // endregion 237 238 // region Materialization 239 240 /** 241 * Convert any StackEntry into a concrete PageSet 242 * 243 * For PageSet entries, returns as-is. NamespacePredicate and NegatedEntry 244 * cannot be resolved without knowing all existing pages — a namespace 245 * predicate needs the full page list to find matching pages, and a negated 246 * entry needs it to compute the complement. This is why materialization 247 * triggers a lazy-load of the full page index from disk. 248 * 249 * This is only needed for standalone namespace or negative-only queries 250 * (e.g., just "@wiki:" or just "-foo"). In combined queries, the typed 251 * operators handle these entries without the universe. 252 * 253 * @param StackEntry $entry 254 * @return PageSet 255 */ 256 protected function materialize(StackEntry $entry): PageSet 257 { 258 if ($entry instanceof PageSet) { 259 return $entry; 260 } 261 262 if ($entry instanceof NegatedEntry) { 263 return $this->getAllPages()->subtract($this->materialize($entry->getInner())); 264 } 265 266 if ($entry instanceof NamespacePredicate) { 267 return $entry->filter($this->getAllPages()); 268 } 269 270 return new PageSet(); 271 } 272 273 /** 274 * Lazy-load the universe of all indexed pages 275 * 276 * @return PageSet all pages with score 0 277 */ 278 protected function getAllPages(): PageSet 279 { 280 if (!$this->allPages instanceof PageSet) { 281 $pages = (new Indexer())->getAllPages(); 282 $this->allPages = new PageSet(array_fill_keys($pages, 0)); 283 } 284 return $this->allPages; 285 } 286 287 // endregion 288} 289