xref: /dokuwiki/inc/fulltext.php (revision 16d24031576f3dd597934facc8b906d512208761)
1<?php
2/**
3 * DokuWiki fulltextsearch functions using the index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9if(!defined('DOKU_INC')) die('meh.');
10require_once(DOKU_INC.'inc/indexer.php');
11
12
/**
 * Entry point of the fulltext search
 *
 * Packs the query and the highlight list into an event data array and
 * passes it to trigger_event() for the SEARCH_QUERY_FULLPAGE event, naming
 * _ft_pageSearch() as the default action.
 *
 * The highlight entry is bound by reference, so whatever the action stores
 * in $data['highlight'] is visible to the caller through $highlight.
 *
 * @param string $query      the search query as entered
 * @param array  &$highlight receives the terms to highlight
 * @return mixed whatever trigger_event() returns
 */
function ft_pageSearch($query,&$highlight){
    $data = array('query' => $query);
    $data['highlight'] =& $highlight;

    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}
28
/**
 * Evaluates a parsed query against the index and returns the matching pages
 *
 * The query is parsed by ft_queryParser() into a postfix (RPN) token list.
 * Every operand token pushes an array (page id => score) onto a stack, every
 * operator token replaces the topmost array(s) by their combination. The
 * array left on the stack is filtered (hidden pages, ACL, existence) and
 * sorted by descending score.
 *
 * Side effect: $data['highlight'] is overwritten with the highlight terms
 * found by the parser.
 *
 * @param array &$data event data with the keys 'query' and 'highlight'
 * @return array page id => score, highest score first
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function _ft_pageSearch(&$data) {
    // parse the query and publish the terms to highlight
    $parsed = ft_queryParser($data['query']);
    $data['highlight'] = $parsed['highlight'];

    if (empty($parsed['parsed_ary'])) return array();

    // a single index lookup for all words of the query
    $word_hits = idx_lookup($parsed['words']);

    // every page id listed in the page index (may include nonexistent
    // pages), each one starting with a score of 0
    $universe = array();
    foreach (idx_getIndex('page', '') as $line) {
        $universe[trim($line)] = 0;
    }

    // evaluate the postfix expression
    $stack = array();
    foreach ($parsed['parsed_ary'] as $token) {
        $type = substr($token, 0, 3);

        if ($type === 'W+:' || $type === 'W-:' || $type === 'W_:') {
            // word: push its index result
            $word    = substr($token, 3);
            $stack[] = (array) $word_hits[$word];

        } elseif ($type === 'P+:' || $type === 'P-:') {
            // phrase: phrases are always parsed as ((W1)(W2)...(P)), so the
            // top of the stack holds the pages containing all of its words;
            // that entry is only inspected here, not removed
            $phrase     = substr($token, 3);
            $candidates = end($stack);
            $with_phrase = array();
            foreach (array_keys($candidates) as $id) {
                $haystack = utf8_strtolower(rawWiki($id));
                if (strpos($haystack, $phrase) !== false) {
                    $with_phrase[$id] = 0; // a phrase never adds to the score
                }
            }
            $stack[] = $with_phrase;

        } elseif ($type === 'N+:' || $type === 'N-:') {
            // namespace: every known page whose id starts with the given string
            $ns    = substr($token, 3);
            $in_ns = array();
            foreach (array_keys($universe) as $id) {
                if (strpos($id, $ns) === 0) {
                    $in_ns[$id] = 0; // a namespace never adds to the score
                }
            }
            $stack[] = $in_ns;

        } elseif ($type === 'AND') {
            // binary: intersection of the two topmost results
            list($left, $right) = array_splice($stack, -2);
            $stack[] = ft_resultCombine(array($left, $right));

        } elseif ($type === 'OR') {
            // binary: union of the two topmost results
            list($left, $right) = array_splice($stack, -2);
            $stack[] = ft_resultUnite(array($left, $right));

        } elseif ($type === 'NOT') {
            // unary: all known pages minus the topmost result
            $excluded = array_pop($stack);
            $stack[]  = ft_resultComplement(array($universe, $excluded));
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) return array();

    // keep only pages that are not hidden, are readable and really exist
    foreach (array_keys($docs) as $id) {
        $keep = !isHiddenPage($id)
             && auth_quickaclcheck($id) >= AUTH_READ
             && page_exists($id, '', false);
        if (!$keep) {
            unset($docs[$id]);
        }
    }

    // highest score first
    arsort($docs);

    return $docs;
}
118
/**
 * Returns the backlinks for a given page
 *
 * Candidate pages are taken from the fulltext index by looking up the
 * tokenized page name (without namespace). A candidate only counts as a
 * backlink if its 'relation references' metadata has an entry for $id.
 * Hidden pages and pages the current user may not read are dropped.
 *
 * @param string $id the page the backlinks should point to
 * @return array sorted list of page ids
 */
function ft_backlinks($id){
    global $conf;

    // stopwords of the configured language, if a list exists
    $stopwords     = array();
    $stopword_file = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if (@file_exists($stopword_file)) {
        $stopwords = file($stopword_file);
    }

    // quick lookup of the pagename; it may contain specials (_ or .),
    // hence the tokenizer
    $tokens     = idx_tokenizer(noNS($id),$stopwords);
    $index_hits = idx_lookup($tokens);
    $candidates = array_keys(ft_resultCombine(array_values($index_hits)));
    $candidates = array_filter($candidates,'isVisiblePage'); // discard hidden pages
    if (!count($candidates)) return array();
    require_once(DOKU_INC.'inc/parserutils.php');

    // first pass: the reference links stored in the metadata are already
    // resolved, so they can be compared with $id directly
    $linking = array();
    foreach ($candidates as $candidate) {
        $references = p_get_metadata($candidate,'relation references');
        if (isset($references[$id])) {
            $linking[] = $candidate;
        }
    }

    if (!count($linking)) return $linking;

    // second pass: keep only what the current user is allowed to read
    $readable = array();
    foreach ($linking as $page) {
        if (auth_quickaclcheck($page) >= AUTH_READ) {
            $readable[] = $page;
        }
    }

    sort($readable);
    return $readable;
}
159
/**
 * Returns the pages that use a given media file
 *
 * Candidate pages are taken from the fulltext index by looking up the
 * tokenized media name (without namespace). The raw wiki text of every
 * candidate is then scanned for {{...}} embeds mentioning that name; each
 * embed is resolved relative to the candidate's namespace and compared
 * with $id.
 *
 * The scan stops as soon as $max pages have been found; a falsy $max is
 * treated as 1.
 *
 * @param string $id  the media id to look for
 * @param int    $max stop after this many pages
 * @return array sorted list of page ids
 */
function ft_mediause($id,$max){
    global $conf;

    // stopwords of the configured language, if a list exists
    $stopwords     = array();
    $stopword_file = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if (@file_exists($stopword_file)) {
        $stopwords = file($stopword_file);
    }

    if (!$max) $max = 1; // need to find at least one

    $using = array();

    // quick lookup of the mediafile
    $medianame  = noNS($id);
    $index_hits = idx_lookup(idx_tokenizer($medianame,$stopwords));
    $candidates = array_keys(ft_resultCombine(array_values($index_hits)));
    if (!count($candidates)) return $using;

    // matches any {{...}} embed whose target part contains the media name
    $pattern = '/\{\{([^|}]*'.preg_quote($medianame,'/').'[^|}]*)(|[^}]+)?\}\}/i';

    foreach ($candidates as $page) {
        $ns = getNS($page);
        preg_match_all($pattern,rawWiki($page),$embeds);

        foreach ($embeds[1] as $target) {
            $target = trim($target);
            if (preg_match('/^https?:\/\//i',$target)) continue; // skip external images
            list($target) = explode('?',$target);                // remove any parameters
            resolve_mediaid($ns,$target,$exists);                // resolve the possibly relative target

            if ($target == $id) {
                // one matching embed is enough for this page
                $using[] = $page;
                break;
            }
        }

        if (count($using) >= $max) break;
    }

    sort($using);
    return $using;
}
207
208
209
/**
 * Quicksearch for pagenames
 *
 * Packs both parameters into an event data array and passes it to
 * trigger_event() for the SEARCH_QUERY_PAGELOOKUP event, naming
 * _ft_pageLookup() as the default action.
 *
 * By default only the pagename itself is matched and the namespace is
 * ignored; pass false as second parameter to match the namespace as well.
 *
 * @param string $id       the string to search for
 * @param bool   $pageonly match the pagename only (default) or the full id
 * @return mixed whatever trigger_event() returns
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function ft_pageLookup($id,$pageonly=true){
    $data = array();
    $data['id']       = $id;
    $data['pageonly'] = $pageonly;

    return trigger_event('SEARCH_QUERY_PAGELOOKUP',$data,'_ft_pageLookup');
}
224
/**
 * Looks up page ids in the page index
 *
 * Reads $conf['indexdir'].'/page.idx' and keeps every id that contains the
 * search string, exists, is not hidden and is readable for the current
 * user. With 'pageonly' set, the search string has to occur in the page
 * name itself (the part returned by noNS()), not just in the namespace.
 *
 * @param array &$data event data with the keys 'id' (search string) and
 *                     'pageonly' (bool)
 * @return array list of page ids, ordered by ft_pagesorter()
 */
function _ft_pageLookup(&$data){
    global $conf;

    // split out original parameters
    $id       = $data['id'];
    $pageonly = $data['pageonly'];

    // file() returns false when the index does not exist (yet) or cannot
    // be read - there is nothing to find in that case
    $idxfile = $conf['indexdir'].'/page.idx';
    $pages   = @file_exists($idxfile) ? file($idxfile) : false;
    if (!is_array($pages)) return array();

    // strip the line endings once, so every check below sees clean ids
    $pages = array_map('trim',$pages);

    // compare against '' explicitly: the search string '0' is a valid term
    $quoted  = preg_quote($id,'/');
    $pattern = '/'.$quoted.'/';
    if ($quoted !== '') {
        $pages = preg_grep($pattern,$pages);
    }

    $found = array();
    foreach ($pages as $page) {
        // the match has to be in the pagename, not only in the namespace
        if ($pageonly && !preg_match($pattern,noNS($page))) continue;

        if (!page_exists($page)) continue;                   // index may list deleted pages
        if (!isVisiblePage($page)) continue;                 // discard hidden pages
        if (auth_quickaclcheck($page) < AUTH_READ) continue; // check ACL permissions

        $found[] = $page;
    }

    usort($found,'ft_pagesorter');
    return $found;
}
263
/**
 * Comparison callback: shallow pages first, then by string value
 *
 * The depth of an id is the number of its colon separated parts. An id with
 * fewer parts always sorts before one with more parts; ids of equal depth
 * are ordered by strcmp().
 *
 * @param string $a first page id
 * @param string $b second page id
 * @return int negative, zero or positive, as expected by usort()
 */
function ft_pagesorter($a, $b){
    // number of parts = number of separators + 1
    $depth_a = substr_count($a, ':') + 1;
    $depth_b = substr_count($b, ':') + 1;

    if ($depth_a !== $depth_b) {
        return ($depth_a < $depth_b) ? -1 : 1;
    }
    return strcmp($a, $b);
}
279
/**
 * Creates a snippet extract
 *
 * Looks for up to four occurrences of the highlight terms in the raw text
 * of the page, cuts out the surrounding context and marks every hit with
 * <strong class="search_hit">. The context pieces are joined by '... '.
 *
 * The whole process is wrapped in the FULLTEXT_SNIPPET_CREATE event. The
 * event data holds the page id, references to the text and the highlight
 * terms, and the resulting snippet; if advise_before() returns false the
 * built-in extraction is skipped and the snippet found in the event data
 * is returned as is.
 *
 * @param string $id        the page to create the snippet for
 * @param array  $highlight the terms to look for
 * @return string the snippet as stored in the event data
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 */
function ft_snippet($id,$highlight){
    $text = rawWiki($id);
    $evdata = array(
                'id'        => $id,
                'text'      => &$text,
                'highlight' => &$highlight,
                'snippet'   => '',
              );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE',$evdata);
    if ($evt->advise_before()) {
        $match    = array();
        $snippets = array();
        $offset      = 0; // byte position where the next search starts
        $utf8_offset = 0; // the same position counted in characters
        $end         = 0; // character position where the last context ended
        $len = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        $terms = array_filter((array) $highlight);
        $re1 = '('.join('|',array_map('preg_quote_cb',$terms)).')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        // tried in this order: three different terms close together,
        // two different terms close together, any single term
        $patterns = array('/'.$re3.'/iu', '/'.$re2.'/iu', '/'.$re1.'/iu');

        for ($round = 0; $round < 4; $round++) {
            $hit = false;
            foreach ($patterns as $pattern) {
                if (preg_match($pattern,$text,$match,PREG_OFFSET_CAPTURE,$offset)) {
                    $hit = true;
                    break;
                }
            }
            if (!$hit) break; // nothing (more) to find

            list($matched,$byte_idx) = $match[0];

            // convert $byte_idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text,0,$byte_idx));
            $utf8_len = utf8_strlen($matched);

            // establish context, up to 100 characters surrounding the match:
            // how far can we go on either side?
            $pre  = min($utf8_idx-$utf8_offset,100);
            $post = min($len-$utf8_idx-$utf8_len,100);

            if ($pre <= 50 && $post <= 50) {
                // neither side exceeds 50, means the context is the whole
                // string: make it so and leave the loop - there is no need
                // for the complex snippet calculations
                $snippets = array($text);
                break;
            }

            if ($pre > 50 && $post > 50) {
                // plenty of room on both sides: 50 each
                $pre = $post = 50;
            } elseif ($pre > 50) {
                // little room behind: give the excess to the front
                $pre = min($pre,100-$post);
            } else {
                // little room in front: give the excess to the back
                $post = min($post,100-$pre);
            }

            // establish context start and end points, try to append to the
            // previous context if possible
            $start  = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false; // still inside the previous context?
            $end    = $utf8_idx + $utf8_len + $post;  // now set it to the end of this context

            if ($append) {
                $last = count($snippets) - 1;
                $snippets[$last] .= utf8_substr($text,$append,$end-$append);
            } else {
                $snippets[] = utf8_substr($text,$start,$end-$start);
            }

            // set $offset for the next match attempt
            //   this is an approximation as the search pattern may match
            //   strings of varying length and it will fail if the context
            //   snippet boundary breaks a matching string longer than the
            //   current match
            $utf8_offset = $utf8_idx + $post;
            $offset = $byte_idx + strlen(utf8_substr($text,$utf8_idx,$post));
            $offset = utf8_correctIdx($text,$offset);
        }

        // wrap every hit into a marker character first, escape the joined
        // text, and only then turn the markers into the highlight markup
        $m = "\1";
        $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets);
        $escaped  = hsc(implode('... ',$snippets));
        $snippet  = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',$escaped);

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
375
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND: a document is only returned if it is a key in every given array,
 * and its score is the sum of its scores in all of them. The order of the
 * first array is kept.
 *
 * A single array is returned unchanged, no array at all gives an empty
 * result.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score
 */
function ft_resultCombine($args){
    $total = count($args);
    if ($total < 1) return array();
    if ($total == 1) return $args[0];

    $combined = array();
    foreach ($args[0] as $page => $score) {
        $in_all = true;
        for ($n = 1; $n < $total; $n++) {
            if (!isset($args[$n][$page])) {
                // missing in one array is enough to drop the page
                $in_all = false;
                break;
            }
            $score += $args[$n][$page];
        }
        if ($in_all) {
            $combined[$page] = $score;
        }
    }
    return $combined;
}
407
/**
 * Unites found documents and sum up their scores
 *
 * This function is used to combine search results with a logical OR: a
 * document is returned if it is a key in at least one of the given arrays,
 * and its score is the sum of its scores in all arrays that contain it.
 * Documents of the first array come first, followed by new documents in
 * the order they are met.
 *
 * A single array is returned unchanged, no array at all gives an empty
 * result.
 *
 * based upon ft_resultCombine() function
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultUnite($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    for ($i = 1; $i < $array_count; $i++) {
        foreach ($args[$i] as $id => $score) {
            // a document that was not seen before starts at 0 instead of
            // reading an undefined array key
            $result[$id] = (isset($result[$id]) ? $result[$id] : 0) + $score;
        }
    }
    return $result;
}
430
/**
 * Computes the difference of documents using page id for comparison
 *
 * Returns the entries of the first array whose page id is not set in any
 * of the other arrays; the scores of the first array are kept.
 *
 * A single array is returned unchanged, no array at all gives an empty
 * result.
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array page id => score, taken from the first array
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultComplement($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    foreach (array_keys($result) as $id) {
        for ($i = 1; $i < $array_count; $i++) {
            if (isset($args[$i][$id])) {
                // found once is enough, no need to check the other arrays
                unset($result[$id]);
                break;
            }
        }
    }
    return $result;
}
453
/**
 * Parses a search query and builds an array of search formulas
 *
 * The query is first rewritten into an intermediate infix string, then
 * converted into a postfix (Reverse Polish notation) token list.
 *
 * The returned array has the following keys:
 *
 *   query      - the query as given
 *   parsed_str - the intermediate infix representation
 *   parsed_ary - the postfix token list
 *   words      - all words to look up in the index
 *   phrases    - all phrases
 *   highlight  - words and phrases to highlight (asterisks removed)
 *   ns, notns, and, not - kept for backward compatibility
 *
 * All keys except the first three hold lists of unique values and are
 * always present, if need be as empty array.
 *
 * @param string $query the search query
 * @return array see above
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_queryParser($query){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;

    // quoted phrases (with an optional leading minus) are split off first;
    // the query is lowercased, so the uppercase operator names of the
    // intermediate representation cannot collide with search terms
    $terms = preg_split('/(-?".*?")/u', utf8_strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')'  , ' ) ', $term);
            $term = str_replace('('  , ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
            $term = trim($term);
            if ($term === '') continue;

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close (a stray one is dropped)
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:'.$matches[1].')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:'.$matches[1].')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($token, $stopwords);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    $parsed_query .= str_repeat(')', $parens_level); // close what was left open
    do {
        // remove empty groups until none is left
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
    $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
    // groups following each other directly are connected by the default operator
    $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array(); // nesting levels at which a "NOT(" group was opened
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator: first move everything of same or higher precedence
            // to the output, but never beyond an opening parenthesis.
            // end() returns false on an empty stack, which must not be
            // used as a key of the precedence table
            $last_ope = end($ope_stack);
            while ($last_ope !== false && $last_ope != '('
                   && $ope_precedence[$token] <= $ope_precedence[$last_ope]) {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand: restore the parentheses encoded by ft_termParser()
            $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        // the predecessor is gone if it was part of the pair removed in
        // the previous round
        if (!isset($parsed_ary[$i - 1])) continue;
        if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // operands look like "X?:body"; substr() is used because the
        // operator 'OR' is too short to have a third character
        if (substr($token, 2, 1) !== ':') continue;
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                $q['ns'][]        = $body; // for backward compatibility
                break;
            case 'N-:':
                $q['notns'][]     = $body; // for backward compatibility
                break;
            case 'W_:':
                $q['words'][]     = $body;
                break;
            case 'W-:':
                $q['words'][]     = $body;
                $q['not'][]       = $body; // for backward compatibility
                break;
            case 'W+:':
                $q['words'][]     = $body;
                $q['highlight'][] = str_replace('*', '', $body);
                $q['and'][]       = $body; // for backward compatibility
                break;
            case 'P-:':
                $q['phrases'][]   = $body;
                break;
            case 'P+:':
                $q['phrases'][]   = $body;
                $q['highlight'][] = str_replace('*', '', $body);
                break;
        }
    }

    // make sure every list exists and holds each value only once
    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}
686
/**
 * Transforms given search term into intermediate representation
 *
 * With $consider_asian set, the term is split into runs of asian and
 * non-asian characters and every run is parsed on its own. From the first
 * asian run on, this and all following runs of the term are parsed in
 * phrase mode.
 *
 * Otherwise the term is tokenized and turned into one of:
 *
 *   ()                          - nothing left after tokenizing
 *   (W+:word)                   - the term is exactly one word
 *   ((W_:w1)(W_:w2)(P+:term))   - phrase mode: the words plus the phrase,
 *                                 parentheses in the phrase encoded as
 *                                 OP and CP
 *   ((W+:w1)(W+:w2))            - several words
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * @param string $term           the term to parse
 * @param array  &$stopwords     passed through to idx_tokenizer()
 * @param bool   $consider_asian split the term at asian character runs first
 * @param bool   $phrase_mode    treat a term of several words as phrase
 * @return string the intermediate representation of the term
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $result = '';
        $chunks = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        foreach ($chunks as $chunk) {
            if (preg_match('/'.IDX_ASIAN.'/u', $chunk)) {
                $phrase_mode = true; // stays on for the remaining chunks
            }
            $result .= ft_termParser($chunk, $stopwords, false, $phrase_mode);
        }
        return $result;
    }

    // parentheses are not part of any word
    $words = idx_tokenizer(str_replace(array('(', ')'), ' ', $term), $stopwords, true);

    if (empty($words)) {
        return '()'; // important: do not remove
    }
    if ($words[0] === $term) {
        return '(W+:'.$words[0].')';
    }
    if ($phrase_mode) {
        // W_: no need to highlight the single words, the phrase does that
        $phrase = str_replace(array('(', ')'), array('OP', 'CP'), $term);
        return '((W_:'.implode(')(W_:', $words).')(P+:'.$phrase.'))';
    }
    return '((W+:'.implode(')(W+:', $words).'))';
}
721
722//Setup VIM: ex: et ts=4 enc=utf-8 :
723