1<?php 2 3namespace dokuwiki\Search\Query; 4 5use dokuwiki\Search\Tokenizer; 6use dokuwiki\Utf8; 7 8/** 9 * DokuWuki QueryParser class 10 */ 11class QueryParser 12{ 13 /** 14 * Transforms given search term into intermediate representation 15 * 16 * This function is used in QueryParser::convert() and not for general purpose use. 17 * 18 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 19 * 20 * @param string $term 21 * @param bool $consider_asian 22 * @param bool $phrase_mode 23 * @return string 24 */ 25 public function termParser($term, $consider_asian = true, $phrase_mode = false) 26 { 27 $parsed = ''; 28 if ($consider_asian) { 29 // successive asian characters need to be searched as a phrase 30 $words = Utf8\Asian::splitAsianWords($term); 31 foreach ($words as $word) { 32 $phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word); 33 $parsed .= $this->termParser($word, false, $phrase_mode); 34 } 35 } else { 36 $term_noparen = str_replace(['(',')'], ' ', $term); 37 $words = Tokenizer::getWords($term_noparen, true); 38 39 // W_: no need to highlight 40 if (empty($words)) { 41 $parsed = '()'; // important: do not remove 42 } elseif ($words[0] === $term) { 43 $parsed = '(W+:'.$words[0].')'; 44 } elseif ($phrase_mode) { 45 $term_encoded = str_replace(['(',')'], ['OP','CP'], $term); 46 $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))'; 47 } else { 48 $parsed = '((W+:'.implode(')(W+:', $words).'))'; 49 } 50 } 51 return $parsed; 52 } 53 54 /** 55 * Parses a search query and builds an array of search formulas 56 * 57 * @author Andreas Gohr <andi@splitbrain.org> 58 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 59 * 60 * @param string $query search query 61 * @return array of search formulas 62 */ 63 public function convert($query) 64 { 65 /** 66 * parse a search query and transform it into intermediate representation 67 * 68 * in a search query, you can use the following expressions: 69 * 70 * words: 71 * include 72 * -exclude 73 * phrases: 74 * "phrase to be included" 75 * -"phrase you want to exclude" 76 * namespaces: 77 * @include:namespace (or ns:include:namespace) 78 * ^exclude:namespace (or -ns:exclude:namespace) 79 * groups: 80 * () 81 * -() 82 * operators: 83 * and ('and' is the default operator: you can always omit this) 84 * or (or pipe symbol '|', lower precedence than 'and') 85 * 86 * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain 87 * a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'". 88 * this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ] 89 * as long as you don't mind hit counts. 90 * 91 * intermediate representation consists of the following parts: 92 * 93 * ( ) - group 94 * AND - logical and 95 * OR - logical or 96 * NOT - logical not 97 * W+:, W-:, W_: - word (underscore: no need to highlight) 98 * P+:, P-: - phrase (minus sign: logically in NOT group) 99 * N+:, N-: - namespace 100 */ 101 $parsed_query = ''; 102 $parens_level = 0; 103 $terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query), 104 -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 105 ); 106 107 foreach ($terms as $term) { 108 $parsed = ''; 109 if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { 110 // phrase-include and phrase-exclude 111 $not = $matches[1] ? 'NOT' : ''; 112 $parsed = $not . $this->termParser($matches[2], false, true); 113 } else { 114 // fix incomplete phrase 115 $term = str_replace('"', ' ', $term); 116 117 // fix parentheses 118 $term = str_replace(')' , ' ) ', $term); 119 $term = str_replace('(' , ' ( ', $term); 120 $term = str_replace('- (', ' -(', $term); 121 122 // treat pipe symbols as 'OR' operators 123 $term = str_replace('|', ' or ', $term); 124 125 // treat ideographic spaces (U+3000) as search term separators 126 // FIXME: some more separators? 127 $term = preg_replace('/[ \x{3000}]+/u', ' ', $term); 128 $term = trim($term); 129 if ($term === '') continue; 130 131 $tokens = explode(' ', $term); 132 foreach ($tokens as $token) { 133 if ($token === '(') { 134 // parenthesis-include-open 135 $parsed .= '('; 136 ++$parens_level; 137 } elseif ($token === '-(') { 138 // parenthesis-exclude-open 139 $parsed .= 'NOT('; 140 ++$parens_level; 141 } elseif ($token === ')') { 142 // parenthesis-any-close 143 if ($parens_level === 0) continue; 144 $parsed .= ')'; 145 $parens_level--; 146 } elseif ($token === 'and') { 147 // logical-and (do nothing) 148 } elseif ($token === 'or') { 149 // logical-or 150 $parsed .= 'OR'; 151 } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) { 152 // namespace-exclude 153 $parsed .= 'NOT(N+:'.$matches[1].')'; 154 } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) { 155 // namespace-include 156 $parsed .= '(N+:'.$matches[1].')'; 157 } elseif (preg_match('/^-(.+)$/', $token, $matches)) { 158 // word-exclude 159 $parsed .= 'NOT('.$this->termParser($matches[1]).')'; 160 } else { 161 // word-include 162 $parsed .= $this->termParser($token); 163 } 164 } 165 } 166 $parsed_query .= $parsed; 167 } 168 169 // cleanup (very sensitive) 170 $parsed_query .= str_repeat(')', $parens_level); 171 do { 172 $parsed_query_old = $parsed_query; 173 $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query); 174 } while ($parsed_query !== $parsed_query_old); 175 $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')' , $parsed_query); 176 $parsed_query = preg_replace('/(OR)+/u' , 'OR' , $parsed_query); 177 $parsed_query = preg_replace('/\(OR/u' , '(' , $parsed_query); 178 $parsed_query = preg_replace('/^OR|OR$/u' , '' , $parsed_query); 179 $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query); 180 181 // adjustment: make highlightings right 182 $parens_level = 0; 183 $notgrp_levels = array(); 184 $parsed_query_new = ''; 185 $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, 186 -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 187 ); 188 foreach ($tokens as $token) { 189 if ($token === 'NOT(') { 190 $notgrp_levels[] = ++$parens_level; 191 } elseif ($token === '(') { 192 ++$parens_level; 193 } elseif ($token === ')') { 194 if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels); 195 } elseif (count($notgrp_levels) % 2 === 1) { 196 // turn highlight-flag off if terms are logically in "NOT" group 197 $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token); 198 } 199 $parsed_query_new .= $token; 200 } 201 $parsed_query = $parsed_query_new; 202 203 /** 204 * convert infix notation string into postfix (Reverse Polish notation) array 205 * by Shunting-yard algorithm 206 * 207 * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation 208 * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm 209 */ 210 $parsed_ary = array(); 211 $ope_stack = array(); 212 $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5); 213 $ope_regex = '/([()]|OR|AND|NOT)/u'; 214 215 $tokens = preg_split($ope_regex, $parsed_query, 216 -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 217 ); 218 foreach ($tokens as $token) { 219 if (preg_match($ope_regex, $token)) { 220 // operator 221 $last_ope = end($ope_stack); 222 while ($last_ope !== false 223 && $ope_precedence[$token] <= $ope_precedence[$last_ope] 224 && $last_ope != '(' 225 ) { 226 $parsed_ary[] = array_pop($ope_stack); 227 $last_ope = end($ope_stack); 228 } 229 if ($token == ')') { 230 array_pop($ope_stack); // this array_pop always deletes '(' 231 } else { 232 $ope_stack[] = $token; 233 } 234 } else { 235 // operand 236 $token_decoded = str_replace(['OP','CP'], ['(',')'], $token); 237 $parsed_ary[] = $token_decoded; 238 } 239 } 240 $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack))); 241 242 // cleanup: each double "NOT" in RPN array actually does nothing 243 $parsed_ary_count = count($parsed_ary); 244 for ($i = 1; $i < $parsed_ary_count; ++$i) { 245 if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') { 246 unset($parsed_ary[$i], $parsed_ary[$i - 1]); 247 } 248 } 249 $parsed_ary = array_values($parsed_ary); 250 251 // build return value 252 $q = array(); 253 $q['query'] = $query; 254 $q['parsed_str'] = $parsed_query; 255 $q['parsed_ary'] = $parsed_ary; 256 257 foreach ($q['parsed_ary'] as $token) { 258 if ($token[2] !== ':') continue; 259 $body = substr($token, 3); 260 261 switch (substr($token, 0, 3)) { 262 case 'N+:': 263 $q['ns'][] = $body; // for backward compatibility 264 break; 265 case 'N-:': 266 $q['notns'][] = $body; // for backward compatibility 267 break; 268 case 'W_:': 269 $q['words'][] = $body; 270 break; 271 case 'W-:': 272 $q['words'][] = $body; 273 $q['not'][] = $body; // for backward compatibility 274 break; 275 case 'W+:': 276 $q['words'][] = $body; 277 $q['highlight'][] = $body; 278 $q['and'][] = $body; // for backward compatibility 279 break; 280 case 'P-:': 281 $q['phrases'][] = $body; 282 break; 283 case 'P+:': 284 $q['phrases'][] = $body; 285 $q['highlight'][] = $body; 286 break; 287 } 288 } 289 foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) { 290 $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key])); 291 } 292 293 return $q; 294 } 295 296 /** 297 * Recreate a search query string based on parsed parts, 298 * doesn't support negated phrases and `OR` searches 299 * 300 * @param array $and 301 * @param array $not 302 * @param array $phrases 303 * @param array $ns 304 * @param array $notns 305 * 306 * @return string 307 */ 308 public function revert(array $and, array $not, array $phrases, array $ns, array $notns) 309 { 310 $query = implode(' ', $and); 311 312 if (!empty($not)) { 313 $query .= ' -' . implode(' -', $not); 314 } 315 if (!empty($phrases)) { 316 $query .= ' "' . implode('" "', $phrases) . '"'; 317 } 318 if (!empty($ns)) { 319 $query .= ' @' . implode(' @', $ns); 320 } 321 if (!empty($notns)) { 322 $query .= ' ^' . implode(' ^', $notns); 323 } 324 return $query; 325 } 326} 327