<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) die('meh.');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * This is only a thin wrapper: it packs its arguments into an event data
 * array and hands them to trigger_event(), naming _ft_pageSearch() as the
 * handler for the SEARCH_QUERY_FULLPAGE event.
 *
 * @param string $query      the search query
 * @param array  &$highlight bound by reference into the event data;
 *                           _ft_pageSearch() stores the terms to highlight here
 * @return mixed the value returned by trigger_event()
 */
function ft_pageSearch($query,&$highlight){
    $data = array();
    $data['query']     = $query;
    $data['highlight'] =& $highlight;

    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}

/**
 * Returns a list of matching documents for the given query
 *
 * The query is parsed by ft_queryParser() into a postfix (RPN) token list
 * which is then evaluated with a stack of "page id => hit count" arrays.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param array &$data event data with the keys 'query' and 'highlight';
 *                     'highlight' is overwritten with the parsed highlight terms
 * @return array page id => hit count, sorted by hit count (descending)
 */
function _ft_pageSearch(&$data) {
    // parse the given query
    $q = ft_queryParser($data['query']);
    $data['highlight'] = $q['highlight'];

    if (empty($q['parsed_ary'])) return array();

    // lookup all words found in the query
    $lookup = idx_lookup($q['words']);

    // get all pages in this dokuwiki site (!: includes nonexistent pages)
    $pages_all = array();
    foreach (idx_getIndex('page', '') as $id) {
        $pages_all[trim($id)] = 0; // base: 0 hit
    }

    // process the query
    $stack = array();
    foreach ($q['parsed_ary'] as $token) {
        switch (substr($token, 0, 3)) {
            case 'W+:':
            case 'W-:':
            case 'W_:': // word
                $word    = substr($token, 3);
                // a word missing from the lookup result simply matches no page
                $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                break;
            case 'P+:':
            case 'P-:': // phrase
                $phrase = substr($token, 3);
                // since phrases are always parsed as ((W1)(W2)...(P)),
                // the end($stack) always points the pages that contain
                // all words in this phrase
                $pages = end($stack);
                if (!is_array($pages)) $pages = array(); // empty stack: nothing to scan
                $pages_matched = array();
                foreach (array_keys($pages) as $id) {
                    $text = utf8_strtolower(rawWiki($id));
                    if (strpos($text, $phrase) !== false) {
                        $pages_matched[$id] = 0; // phrase: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'N+:':
            case 'N-:': // namespace
                $ns = substr($token, 3);
                $pages_matched = array();
                foreach (array_keys($pages_all) as $id) {
                    // plain prefix match on the page id
                    if (strpos($id, $ns) === 0) {
                        $pages_matched[$id] = 0; // namespace: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'AND': // and operation
                list($pages1, $pages2) = array_splice($stack, -2);
                $stack[] = ft_resultCombine(array($pages1, $pages2));
                break;
            case 'OR':  // or operation
                list($pages1, $pages2) = array_splice($stack, -2);
                $stack[] = ft_resultUnite(array($pages1, $pages2));
                break;
            case 'NOT': // not operation (unary)
                $pages   = array_pop($stack);
                $stack[] = ft_resultComplement(array($pages_all, $pages));
                break;
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) return array();

    // check: settings, acls, existence
    foreach (array_keys($docs) as $id) {
        if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
            unset($docs[$id]);
        }
    }

    // sort docs by count
    arsort($docs);

    return $docs;
}

/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup of the page name in the fulltext index, then checks
 * the 'relation references' metadata of every candidate page for an entry
 * keyed by $id. Hidden pages and pages below AUTH_READ are dropped.
 *
 * @param string $id the page id to find backlinks for
 * @return array sorted list of page ids referencing $id
 */
function ft_backlinks($id){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
    if(!count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check metadata for matching links
    foreach($docs as $match){
        // metadata relation reference links are already resolved
        $links = p_get_metadata($match,'relation references');
        if (isset($links[$id])) $result[] = $match;
    }

    if(!count($result)) return $result;

    // check ACL permissions
    foreach(array_keys($result) as $idx){
        if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
            unset($result[$idx]);
        }
    }

    sort($result);
    return $result;
}

/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup of the media file name in the fulltext index, then
 * scans the raw wiki text of each candidate page for {{...}} media syntax
 * that resolves to $id.
 *
 * Aborts after $max found results
 *
 * @param string $id  the media id to look for
 * @param int    $max maximum number of pages to report; a falsy value is treated as 1
 * @return array sorted list of page ids using the media file
 */
function ft_mediause($id,$max){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    if(!$max) $max = 1; // need to find at least one

    $result = array();

    // quick lookup of the mediafile
    $media   = noNS($id);
    $matches = idx_lookup(idx_tokenizer($media,$stopwords));
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    if(!count($docs)) return $result;

    // go through all found pages
    $found = 0;
    $pcre  = preg_quote($media,'/');
    foreach($docs as $doc){
        $ns = getNS($doc);
        preg_match_all('/\{\{([^|}]*'.$pcre.'[^|}]*)(|[^}]+)?\}\}/i',rawWiki($doc),$matches);
        foreach($matches[1] as $img){
            $img = trim($img);
            if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images
            list($img) = explode('?',$img);                  // remove any parameters
            resolve_mediaid($ns,$img,$exists);               // resolve the possibly relative img

            // strict string comparison: a loose == would treat numeric-looking
            // ids such as "1e3" and "1000" as the same media file
            if((string) $img === (string) $id){ // we have a match
                $result[] = $doc;
                $found++;
                break;
            }
        }
        if($found >= $max) break;
    }

    sort($result);
    return $result;
}



/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * Thin wrapper: the actual work is done by _ft_pageLookup(), which is
 * passed to trigger_event() as the handler for SEARCH_QUERY_PAGELOOKUP.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       the (partial) page name to look for
 * @param bool   $pageonly match against the page name without its namespace
 * @return mixed the value returned by trigger_event()
 */
function ft_pageLookup($id,$pageonly=true){
    $data = array('id' => $id, 'pageonly' => $pageonly);
    return trigger_event('SEARCH_QUERY_PAGELOOKUP',$data,'_ft_pageLookup');
}

/**
 * Default action of the SEARCH_QUERY_PAGELOOKUP event
 *
 * Reads the page index file, keeps the ids containing the search string,
 * and drops nonexistent, hidden and unreadable pages.
 *
 * @param array &$data event data with the keys 'id' and 'pageonly'
 * @return array list of matching page ids, ordered by ft_pagesorter()
 */
function _ft_pageLookup(&$data){
    // split out original parameters
    $id       = $data['id'];
    $pageonly = $data['pageonly'];

    global $conf;
    $id    = preg_quote($id,'/');
    $pages = file($conf['indexdir'].'/page.idx');
    // file() returns false when the index cannot be read: nothing to look up
    if($pages === false) return array();

    // every index line still carries its line ending; strip it once so all
    // checks below operate on the clean page id
    $pages = array_map('trim',$pages);
    if($id) $pages = array_values(preg_grep('/'.$id.'/',$pages));

    foreach(array_keys($pages) as $idx){
        // in page-only mode the match has to be in the name part, not the namespace
        if($pageonly && !preg_match('/'.$id.'/',noNS($pages[$idx]))){
            unset($pages[$idx]);
            continue;
        }
        if(!page_exists($pages[$idx])){
            unset($pages[$idx]);
        }
    }

    $pages = array_filter($pages,'isVisiblePage'); // discard hidden pages
    if(!count($pages)) return array();

    // check ACL permissions
    foreach(array_keys($pages) as $idx){
        if(auth_quickaclcheck($pages[$idx]) < AUTH_READ){
            unset($pages[$idx]);
        }
    }

    usort($pages,'ft_pagesorter');
    return $pages;
}

/**
 * Sort pages based on their namespace level first, then on their string
 * values. This makes higher hierarchy pages rank higher than lower hierarchy
 * pages.
 *
 * @param string $a first page id
 * @param string $b second page id
 * @return int negative, zero or positive, as expected by usort()
 */
function ft_pagesorter($a, $b){
    // the number of colon separated parts is the depth of the page
    $ac = count(explode(':',$a));
    $bc = count(explode(':',$b));
    if($ac < $bc){
        return -1;
    }elseif($ac > $bc){
        return 1;
    }
    return strcmp ($a,$b);
}

/**
 * Creates a snippet extract
 *
 * Up to four match contexts are cut from the raw page text and joined
 * with '... '. The matched terms are wrapped in
 * <strong class="search_hit"> after the text has been passed through hsc().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string       $id        the page to create the snippet for
 * @param array|string $highlight the terms to highlight
 * @return string the snippet stored in the event data (HTML)
 */
function ft_snippet($id,$highlight){
    $text = rawWiki($id);
    $evdata = array(
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
            );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE',$evdata);
    if ($evt->advise_before()) {
        $match = array();
        $snippets = array();
        $utf8_offset = $offset = $end = 0;
        $len = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        $re1 = '('.join('|',array_map('preg_quote_cb',array_filter((array) $highlight))).')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        for ($cnt=4; $cnt--;) {
            // prefer a context holding three different terms, then two, then one;
            // the first pattern that matches leaves its result in $match
            if (!preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset) &&
                !preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset) &&
                !preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) {
                break;
            }

            list($str,$idx) = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text,0,$idx));
            $utf8_len = utf8_strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre  = min($utf8_idx-$utf8_offset,100);
            $post = min($len-$utf8_idx-$utf8_len,100);

            if ($pre>50 && $post>50) {
                $pre = $post = 50;
            } else if ($pre>50) {
                $pre = min($pre,100-$post);
            } else if ($post>50) {
                $post = min($post, 100-$pre);
            } else {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = array($text);
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start  = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
            $end    = $utf8_idx + $utf8_len + $post;   // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
            } else {
                $snippets[] = utf8_substr($text,$start,$end-$start);
            }

            // set $offset for next match attempt
            //   subtract strlen to avoid splitting a potential search success,
            //   this is an approximation as the search pattern may match strings
            //   of varying length and it will fail if the context snippet
            //   boundary breaks a matching string longer than the current match
            $utf8_offset = $utf8_idx + $post;
            $offset      = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
            $offset      = utf8_correctIdx($text,$offset);
        }

        // mark the hits with a control character first, so that the markers
        // pass through hsc() and can be turned into markup afterwards
        $m = "\1";
        $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets);
        $snippet = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score; empty when $args is empty
 */
function ft_resultCombine($args){
    $array_count = count($args);
    if($array_count == 1){
        return $args[0];
    }

    $result = array();
    // a non-array first operand contains no pages, so the intersection is empty
    if ($array_count > 1 && is_array($args[0])) {
        foreach ($args[0] as $key => $value) {
            $result[$key] = $value;
            for ($i = 1; $i < $array_count; $i++) {
                if (!isset($args[$i][$key])) {
                    unset($result[$key]);
                    break;
                }
                $result[$key] += $args[$i][$key];
            }
        }
    }
    return $result;
}

/**
 * Unites found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical OR.
 *
 * based upon ft_resultCombine() function
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score; empty when $args is empty
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultUnite($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        // nothing to unite; also keeps the loop below from running without end
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = is_array($args[0]) ? $args[0] : array();
    for ($i = 1; $i < $array_count; $i++) {
        if (!is_array($args[$i])) continue; // a missing operand adds no pages
        foreach ($args[$i] as $id => $hits) {
            if (isset($result[$id])) {
                $result[$id] += $hits;
            } else {
                // page seen for the first time: start from its own score
                $result[$id] = $hits;
            }
        }
    }
    return $result;
}

/**
 * Computes the difference of documents using page id for comparison
 *
 * Returns the entries of the first array whose page id is not present in
 * any of the following arrays.
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array the remaining entries of the first array; empty when $args is empty
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultComplement($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = is_array($args[0]) ? $args[0] : array();
    foreach (array_keys($result) as $id) {
        for ($i = 1; $i < $array_count; $i++) {
            if (isset($args[$i][$id])) {
                unset($result[$id]);
                break; // already removed, no need to check the other arrays
            }
        }
    }
    return $result;
}

/**
 * Parses a search query and builds an array of search formulas
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param string $query the search query as entered by the user
 * @return array with the keys 'query' (the original query), 'parsed_str'
 *               (intermediate infix representation), 'parsed_ary' (the same
 *               in postfix order) and the term lists 'words', 'phrases',
 *               'highlight', 'ns', 'notns', 'and' and 'not'
 */
function ft_queryParser($query){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    $terms = preg_split('/(-?".*?")/u', utf8_strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')'  , ' ) ', $term);
            $term = str_replace('('  , ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
            $term = trim($term);
            if ($term === '') continue;

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:'.$matches[1].')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:'.$matches[1].')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($token, $stopwords);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    $parsed_query .= str_repeat(')', $parens_level);
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
    $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
    $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array();
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator
            $last_ope = end($ope_stack);
            // end() yields false on an empty stack: stop before using it as an index
            while ($last_ope !== false && $ope_precedence[$token] <= $ope_precedence[$last_ope] && $last_ope != '(') {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand
            $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        // the previous slot may already have been removed as part of an earlier pair
        if (isset($parsed_ary[$i - 1]) && $parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // operators ('OR', 'AND', 'NOT') carry no term; 'OR' is too short to index at [2]
        if (strlen($token) < 3 || $token[2] !== ':') continue;
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                $q['ns'][]        = $body; // for backward compatibility
                break;
            case 'N-:':
                $q['notns'][]     = $body; // for backward compatibility
                break;
            case 'W_:':
                $q['words'][]     = $body;
                break;
            case 'W-:':
                $q['words'][]     = $body;
                $q['not'][]       = $body; // for backward compatibility
                break;
            case 'W+:':
                $q['words'][]     = $body;
                $q['highlight'][] = str_replace('*', '', $body);
                $q['and'][]       = $body; // for backward compatibility
                break;
            case 'P-:':
                $q['phrases'][]   = $body;
                break;
            case 'P+:':
                $q['phrases'][]   = $body;
                $q['highlight'][] = str_replace('*', '', $body);
                break;
        }
    }
    // make sure every list exists and holds each term only once
    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}

/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param string $term           the search term
 * @param array  &$stopwords     stopword list handed on to idx_tokenizer()
 * @param bool   $consider_asian split the term at runs of IDX_ASIAN characters
 *                               and search those runs as phrases
 * @param bool   $phrase_mode    treat a term made of several words as a phrase
 * @return string the term in intermediate representation
 */
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
    $parsed = '';
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        foreach ($words as $word) {
            // note: once set, phrase mode stays on for the remaining parts
            if (preg_match('/'.IDX_ASIAN.'/u', $word)) $phrase_mode = true;
            $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
        }
    } else {
        $term_noparen = str_replace(array('(', ')'), ' ', $term);
        $words = idx_tokenizer($term_noparen, $stopwords, true);

        // W_: no need to highlight
        if (empty($words)) {
            $parsed = '()'; // important: do not remove
        } elseif ($words[0] === $term) {
            $parsed = '(W+:'.$words[0].')';
        } elseif ($phrase_mode) {
            // parentheses inside a phrase are encoded so that they are not
            // mistaken for grouping; ft_queryParser() decodes them again
            $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
            $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
        } else {
            $parsed = '((W+:'.implode(')(W+:', $words).'))';
        }
    }
    return $parsed;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :