1<?php 2 3namespace dokuwiki\Search\Query; 4 5use dokuwiki\Search\Tokenizer; 6use dokuwiki\Utf8\Asian; 7use dokuwiki\Utf8\PhpString; 8 9/** 10 * DokuWuki QueryParser class 11 */ 12class QueryParser 13{ 14 /** 15 * Transforms given search term into intermediate representation 16 * 17 * This function is used in QueryParser::convert() and not for general purpose use. 18 * 19 * @param string $term 20 * @param bool $consider_asian 21 * @param bool $phrase_mode 22 * @return string 23 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 24 * 25 */ 26 public function termParser(string $term, bool $consider_asian = true, bool $phrase_mode = false): string 27 { 28 $parsed = ''; 29 if ($consider_asian) { 30 // successive asian characters need to be searched as a phrase 31 $words = Asian::splitAsianWords($term); 32 foreach ($words as $word) { 33 $phrase_mode = $phrase_mode || Asian::isAsianWords($word); 34 $parsed .= $this->termParser($word, false, $phrase_mode); 35 } 36 } else { 37 $term_noparen = str_replace(['(', ')'], ' ', $term); 38 $words = Tokenizer::getWords($term_noparen, true); 39 40 // W_: no need to highlight 41 if ($words === []) { 42 $parsed = '()'; // important: do not remove 43 } elseif ($words[0] === $term) { 44 $parsed = '(W+:' . $words[0] . ')'; 45 } elseif ($phrase_mode) { 46 $term_encoded = str_replace(['(', ')'], ['OP', 'CP'], $term); 47 $parsed = '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))'; 48 } else { 49 $parsed = '((W+:' . implode(')(W+:', $words) . '))'; 50 } 51 } 52 return $parsed; 53 } 54 55 /** 56 * Parses a search query and builds an array of search formulas 57 * 58 * @param string $query search query 59 * @return array of search formulas 60 * @author Andreas Gohr <andi@splitbrain.org> 61 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 62 * 63 */ 64 public function convert(string $query): array 65 { 66 /** 67 * parse a search query and transform it into intermediate representation 68 * 69 * in a search query, you can use the following expressions: 70 * 71 * words: 72 * include 73 * -exclude 74 * phrases: 75 * "phrase to be included" 76 * -"phrase you want to exclude" 77 * namespaces: 78 * @include:namespace (or ns:include:namespace) 79 * ^exclude:namespace (or -ns:exclude:namespace) 80 * groups: 81 * () 82 * -() 83 * operators: 84 * and ('and' is the default operator: you can always omit this) 85 * or (or pipe symbol '|', lower precedence than 'and') 86 * 87 * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain 88 * a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'". 89 * this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ] 90 * as long as you don't mind hit counts. 91 * 92 * intermediate representation consists of the following parts: 93 * 94 * ( ) - group 95 * AND - logical and 96 * OR - logical or 97 * NOT - logical not 98 * W+:, W-:, W_: - word (underscore: no need to highlight) 99 * P+:, P-: - phrase (minus sign: logically in NOT group) 100 * N+:, N-: - namespace 101 */ 102 $parsed_query = ''; 103 $parens_level = 0; 104 $terms = preg_split( 105 '/(-?".*?")/u', 106 PhpString::strtolower($query), 107 -1, 108 PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 109 ); 110 111 foreach ($terms as $term) { 112 $parsed = ''; 113 if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { 114 // phrase-include and phrase-exclude 115 $not = $matches[1] ? 'NOT' : ''; 116 $parsed = $not . $this->termParser($matches[2], false, true); 117 } else { 118 // fix incomplete phrase 119 $term = str_replace('"', ' ', $term); 120 121 // fix parentheses 122 $term = str_replace(')', ' ) ', $term); 123 $term = str_replace('(', ' ( ', $term); 124 $term = str_replace('- (', ' -(', $term); 125 126 // treat pipe symbols as 'OR' operators 127 $term = str_replace('|', ' or ', $term); 128 129 // treat ideographic spaces (U+3000) as search term separators 130 // FIXME: some more separators? 131 $term = preg_replace('/[ \x{3000}]+/u', ' ', $term); 132 $term = trim($term); 133 if ($term === '') continue; 134 135 $tokens = explode(' ', $term); 136 foreach ($tokens as $token) { 137 if ($token === '(') { 138 // parenthesis-include-open 139 $parsed .= '('; 140 ++$parens_level; 141 } elseif ($token === '-(') { 142 // parenthesis-exclude-open 143 $parsed .= 'NOT('; 144 ++$parens_level; 145 } elseif ($token === ')') { 146 // parenthesis-any-close 147 if ($parens_level === 0) continue; 148 $parsed .= ')'; 149 $parens_level--; 150 } elseif ($token === 'and') { 151 // logical-and (do nothing) 152 } elseif ($token === 'or') { 153 // logical-or 154 $parsed .= 'OR'; 155 } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) { 156 // namespace-exclude 157 $parsed .= 'NOT(N+:' . $matches[1] . ')'; 158 } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) { 159 // namespace-include 160 $parsed .= '(N+:' . $matches[1] . ')'; 161 } elseif (preg_match('/^-(.+)$/', $token, $matches)) { 162 // word-exclude 163 $parsed .= 'NOT(' . $this->termParser($matches[1]) . ')'; 164 } else { 165 // word-include 166 $parsed .= $this->termParser($token); 167 } 168 } 169 } 170 $parsed_query .= $parsed; 171 } 172 173 // cleanup (very sensitive) 174 $parsed_query .= str_repeat(')', $parens_level); 175 do { 176 $parsed_query_old = $parsed_query; 177 $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query); 178 } while ($parsed_query !== $parsed_query_old); 179 $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')', $parsed_query); 180 $parsed_query = preg_replace('/(OR)+/u', 'OR', $parsed_query); 181 $parsed_query = preg_replace('/\(OR/u', '(', $parsed_query); 182 $parsed_query = preg_replace('/^OR|OR$/u', '', $parsed_query); 183 $parsed_query = preg_replace('/\)(NOT)?\(/u', ')AND$1(', $parsed_query); 184 185 // adjustment: make highlightings right 186 $parens_level = 0; 187 $notgrp_levels = []; 188 $parsed_query_new = ''; 189 $tokens = preg_split( 190 '/(NOT\(|[()])/u', 191 $parsed_query, 192 -1, 193 PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 194 ); 195 foreach ($tokens as $token) { 196 if ($token === 'NOT(') { 197 $notgrp_levels[] = ++$parens_level; 198 } elseif ($token === '(') { 199 ++$parens_level; 200 } elseif ($token === ')') { 201 if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels); 202 } elseif (count($notgrp_levels) % 2 === 1) { 203 // turn highlight-flag off if terms are logically in "NOT" group 204 $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token); 205 } 206 $parsed_query_new .= $token; 207 } 208 $parsed_query = $parsed_query_new; 209 210 /** 211 * convert infix notation string into postfix (Reverse Polish notation) array 212 * by Shunting-yard algorithm 213 * 214 * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation 215 * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm 216 */ 217 $parsed_ary = []; 218 $ope_stack = []; 219 $ope_precedence = [')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5]; 220 $ope_regex = '/([()]|OR|AND|NOT)/u'; 221 222 $tokens = preg_split( 223 $ope_regex, 224 $parsed_query, 225 -1, 226 PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 227 ); 228 foreach ($tokens as $token) { 229 if (preg_match($ope_regex, $token)) { 230 // operator 231 $last_ope = end($ope_stack); 232 while ( 233 $last_ope !== false 234 && $ope_precedence[$token] <= $ope_precedence[$last_ope] 235 && $last_ope != '(' 236 ) { 237 $parsed_ary[] = array_pop($ope_stack); 238 $last_ope = end($ope_stack); 239 } 240 if ($token == ')') { 241 array_pop($ope_stack); // this array_pop always deletes '(' 242 } else { 243 $ope_stack[] = $token; 244 } 245 } else { 246 // operand 247 $token_decoded = str_replace(['OP', 'CP'], ['(', ')'], $token); 248 $parsed_ary[] = $token_decoded; 249 } 250 } 251 $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack))); 252 253 // cleanup: each double "NOT" in RPN array actually does nothing 254 $parsed_ary_count = count($parsed_ary); 255 for ($i = 1; $i < $parsed_ary_count; ++$i) { 256 if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') { 257 unset($parsed_ary[$i], $parsed_ary[$i - 1]); 258 } 259 } 260 $parsed_ary = array_values($parsed_ary); 261 262 // build return value 263 $q = []; 264 $q['query'] = $query; 265 $q['parsed_str'] = $parsed_query; 266 $q['parsed_ary'] = $parsed_ary; 267 268 foreach ($q['parsed_ary'] as $token) { 269 if ($token[2] !== ':') continue; 270 $body = substr($token, 3); 271 272 switch (substr($token, 0, 3)) { 273 case 'N+:': 274 $q['ns'][] = $body; // for backward compatibility 275 break; 276 case 'N-:': 277 $q['notns'][] = $body; // for backward compatibility 278 break; 279 case 'W_:': 280 $q['words'][] = $body; 281 break; 282 case 'W-:': 283 $q['words'][] = $body; 284 $q['not'][] = $body; // for backward compatibility 285 break; 286 case 'W+:': 287 $q['words'][] = $body; 288 $q['highlight'][] = $body; 289 $q['and'][] = $body; // for backward compatibility 290 break; 291 case 'P-:': 292 $q['phrases'][] = $body; 293 break; 294 case 'P+:': 295 $q['phrases'][] = $body; 296 $q['highlight'][] = $body; 297 break; 298 } 299 } 300 foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) { 301 $q[$key] = empty($q[$key]) ? [] : array_values(array_unique($q[$key])); 302 } 303 304 return $q; 305 } 306 307 /** 308 * Recreate a search query string based on parsed parts, 309 * doesn't support negated phrases and `OR` searches 310 * 311 * @param array $and 312 * @param array $not 313 * @param array $phrases 314 * @param array $ns 315 * @param array $notns 316 * 317 * @return string 318 */ 319 public function revert(array $and, array $not, array $phrases, array $ns, array $notns) 320 { 321 $query = implode(' ', $and); 322 323 if ($not !== []) { 324 $query .= ' -' . implode(' -', $not); 325 } 326 if ($phrases !== []) { 327 $query .= ' "' . implode('" "', $phrases) . '"'; 328 } 329 if ($ns !== []) { 330 $query .= ' @' . implode(' @', $ns); 331 } 332 if ($notns !== []) { 333 $query .= ' ^' . implode(' ^', $notns); 334 } 335 return $query; 336 } 337} 338