xref: /dokuwiki/inc/fulltext.php (revision 90a0f2e151531db5b76c3d1c340f70da35922456)
1<?php
2/**
3 * DokuWiki fulltextsearch functions using the index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9if(!defined('DOKU_INC')) die('meh.');
10require_once(DOKU_INC.'inc/indexer.php');
11
/**
 * create snippets for the first few results only
 *
 * The default of 15 applies only when FT_SNIPPET_NUMBER has not been
 * defined before this file is loaded.
 */
defined('FT_SNIPPET_NUMBER') || define('FT_SNIPPET_NUMBER', 15);
16
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The work is split into ft_pageSearch(), _ft_pageSearch() and
 * trigger_event(): this function only packs its arguments into an event
 * data array and hands it to trigger_event() for SEARCH_QUERY_FULLPAGE,
 * naming _ft_pageSearch() as the default action.
 *
 * @param string $query      the search query
 * @param mixed  &$highlight bound by reference into the event data;
 *                           _ft_pageSearch() stores the terms to highlight in it
 * @return mixed the value returned by trigger_event()
 */
function ft_pageSearch($query,&$highlight){
    $data = array(
        'query'     => $query,
        'highlight' => &$highlight,
    );

    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}
32
/**
 * Returns a list of matching documents for the given query
 *
 * Parses $data['query'] with ft_queryParser(), evaluates the resulting
 * postfix token list against the word index using a stack of page lists
 * and finally drops hidden, unreadable and nonexistent pages.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param array &$data event data; 'query' is read, 'highlight' is overwritten
 *                     with the terms to highlight found by the query parser
 * @return array page id => hit count, sorted by hit count (descending)
 */
function _ft_pageSearch(&$data) {
    // parse the given query
    $q = ft_queryParser($data['query']);
    $data['highlight'] = $q['highlight'];

    if (empty($q['parsed_ary'])) return array();

    // lookup all words found in the query
    $lookup = idx_lookup($q['words']);

    // get all pages in this dokuwiki site (!: includes nonexistent pages)
    $pages_all = array();
    foreach (idx_getIndex('page', '') as $id) {
        $pages_all[trim($id)] = 0; // base: 0 hit
    }

    // process the query: operands push a page list, operators pop their
    // arguments and push the combined list
    $stack = array();
    foreach ($q['parsed_ary'] as $token) {
        switch (substr($token, 0, 3)) {
            case 'W+:':
            case 'W-:':
            case 'W_:': // word
                $word    = substr($token, 3);
                // a word the index lookup did not return simply matches no page
                $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                break;
            case 'P+:':
            case 'P-:': // phrase
                $phrase = substr($token, 3);
                // since phrases are always parsed as ((W1)(W2)...(P)),
                // the end($stack) always points the pages that contain
                // all words in this phrase
                $pages  = end($stack);
                if (!is_array($pages)) $pages = array(); // empty stack: nothing to scan
                $pages_matched = array();
                foreach(array_keys($pages) as $id){
                    $text = utf8_strtolower(rawWiki($id));
                    if (strpos($text, $phrase) !== false) {
                        $pages_matched[$id] = 0; // phrase: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'N+:':
            case 'N-:': // namespace
                // compare against "ns:" so that a namespace "wiki" does not
                // also match pages below "wikipedia:"
                $ns = trim(substr($token, 3), ':').':';
                $pages_matched = array();
                foreach (array_keys($pages_all) as $id) {
                    if (strpos($id, $ns) === 0) {
                        $pages_matched[$id] = 0; // namespace: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'AND': // and operation
                list($pages1, $pages2) = array_splice($stack, -2);
                $stack[] = ft_resultCombine(array($pages1, $pages2));
                break;
            case 'OR':  // or operation
                list($pages1, $pages2) = array_splice($stack, -2);
                $stack[] = ft_resultUnite(array($pages1, $pages2));
                break;
            case 'NOT': // not operation (unary)
                $pages   = array_pop($stack);
                $stack[] = ft_resultComplement(array($pages_all, $pages));
                break;
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) return array();

    // check: settings, acls, existence
    foreach (array_keys($docs) as $id) {
        if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
            unset($docs[$id]);
        }
    }

    // sort docs by count
    arsort($docs);

    return $docs;
}
122
/**
 * Returns the backlinks for a given page
 *
 * Uses the fulltext index to find candidate pages mentioning the page name,
 * then keeps those whose 'relation references' metadata lists $id and
 * which the current user may read.
 *
 * @param string $id the page id to find backlinks for
 * @return array sorted list of page ids linking to $id
 */
function ft_backlinks($id){
    global $conf;

    $stopwordFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if (@file_exists($stopwordFile)) {
        $stopwords = file($stopwordFile);
    } else {
        $stopwords = array();
    }

    // quick lookup of the pagename, which may contain specials (_ or .)
    $tokens     = idx_tokenizer(noNS($id), $stopwords);
    $hits       = idx_lookup($tokens);
    $candidates = array_keys(ft_resultCombine(array_values($hits)));
    $candidates = array_filter($candidates, 'isVisiblePage'); // discard hidden pages
    if (count($candidates) == 0) return array();
    require_once(DOKU_INC.'inc/parserutils.php');

    // metadata relation reference links are already resolved, so a plain
    // key lookup is enough to confirm the link
    $linking = array();
    foreach ($candidates as $candidate) {
        $references = p_get_metadata($candidate, 'relation references');
        if (isset($references[$id])) {
            $linking[] = $candidate;
        }
    }
    if (count($linking) == 0) return $linking;

    // keep only pages the user is allowed to read
    $readable = array();
    foreach ($linking as $page) {
        if (auth_quickaclcheck($page) >= AUTH_READ) {
            $readable[] = $page;
        }
    }

    sort($readable);
    return $readable;
}
163
/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup with the fulltext index, then scans the raw wiki
 * text of the found pages for {{...}} media syntax referring to $id.
 *
 * Aborts after $max found results
 *
 * @param string $id  the media id to look for
 * @param int    $max stop after this many pages; a falsy value is treated as 1
 * @return array sorted list of page ids using the media file
 */
function ft_mediause($id,$max){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    if(!$max) $max = 1; // need to find at least one

    $result = array();

    // quick lookup of the mediafile
    $media   = noNS($id);
    $matches = idx_lookup(idx_tokenizer($media,$stopwords));
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    if(!count($docs)) return $result;

    // go through all found pages
    $found = 0;
    $pcre  = preg_quote($media,'/');
    foreach($docs as $doc){
        $ns = getNS($doc);
        // group 1: the media reference, group 2: optional "|caption" part
        // (the pipe has to be escaped, a bare | is an alternation)
        preg_match_all('/\{\{([^|}]*'.$pcre.'[^|}]*)(\|[^}]+)?\}\}/i',rawWiki($doc),$embeds);
        foreach($embeds[1] as $img){
            $img = trim($img);
            if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images
            list($img) = explode('?',$img);                  // remove any parameters
            resolve_mediaid($ns,$img,$exists);               // resolve the possibly relative img

            // compare as strings: loose == would treat numeric looking ids
            // such as "1.0" and "1.00" as equal
            if((string) $img === (string) $id){              // we have a match
                $result[] = $doc;
                $found++;
                break;
            }
        }
        if($found >= $max) break;
    }

    sort($result);
    return $result;
}
211
212
213
/**
 * Quicksearch for pagenames
 *
 * By default only the pagename is matched and the namespace is ignored;
 * pass false as second parameter to match against the full id.
 *
 * Split into ft_pageLookup(), _ft_pageLookup() and trigger_event(): this
 * function only builds the event data and triggers SEARCH_QUERY_PAGELOOKUP
 * with _ft_pageLookup() as the default action.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       the text to look for in page ids
 * @param bool   $pageonly match the page name only, ignoring namespaces
 * @return mixed the value returned by trigger_event()
 */
function ft_pageLookup($id,$pageonly=true){
    $data = array();
    $data['id']       = $id;
    $data['pageonly'] = $pageonly;

    return trigger_event('SEARCH_QUERY_PAGELOOKUP',$data,'_ft_pageLookup');
}
228
/**
 * Default action for the SEARCH_QUERY_PAGELOOKUP event
 *
 * Greps the page index for ids containing $data['id'] and returns those
 * that exist, are not hidden and are readable by the current user.
 *
 * @param array &$data event data with the keys 'id' (text to look for) and
 *                     'pageonly' (match the page name without namespace only)
 * @return array matching page ids, sorted with ft_pagesorter()
 */
function _ft_pageLookup(&$data){
    global $conf;

    // split out original parameters
    $id       = $data['id'];
    $pageonly = $data['pageonly'];

    // without an index there is nothing to find
    $index = $conf['indexdir'].'/page.idx';
    $pages = @file_exists($index) ? file($index) : false;
    if($pages === false) return array();

    // index lines still carry their line break: strip it once up front so
    // every later check works on the clean page id
    $pages = array_map('trim',$pages);

    $id      = preg_quote($id,'/');
    $pattern = '/'.$id.'/';
    if($id) $pages = preg_grep($pattern,$pages);

    $result = array();
    foreach($pages as $page){
        // in pageonly mode the match has to be in the page name itself
        if($pageonly && !preg_match($pattern,noNS($page))) continue;
        if(!page_exists($page)) continue;
        if(!isVisiblePage($page)) continue;                 // discard hidden pages
        if(auth_quickaclcheck($page) < AUTH_READ) continue; // check ACL permissions
        $result[] = $page;
    }

    usort($result,'ft_pagesorter');
    return $result;
}
267
/**
 * Comparison callback ordering page ids by namespace depth first and by
 * string value second, so that pages higher up in the hierarchy rank
 * before pages nested more deeply.
 *
 * @param string $a first page id
 * @param string $b second page id
 * @return int negative, zero or positive as expected by usort()
 */
function ft_pagesorter($a, $b){
    // the number of colons tells how deep an id is nested
    $depthA = substr_count($a, ':');
    $depthB = substr_count($b, ':');

    if($depthA !== $depthB){
        return ($depthA < $depthB) ? -1 : 1;
    }
    return strcmp($a, $b);
}
283
/**
 * Creates a snippet extract
 *
 * Loads the raw wiki text of a page and cuts out up to four context
 * windows around the terms to highlight. The result is HTML escaped with
 * hsc() and the terms are wrapped in <strong class="search_hit">.
 *
 * The default behaviour only runs when the FULLTEXT_SNIPPET_CREATE event is
 * not prevented; otherwise whatever is stored in the event data's 'snippet'
 * field is returned.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string       $id        the page to create the snippet for
 * @param array|string $highlight the term(s) to highlight
 * @return string the snippet
 */
function ft_snippet($id,$highlight){
    $text = rawWiki($id);
    $evdata = array(
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
            );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE',$evdata);
    if ($evt->advise_before()) {
        $match = array();
        $snippets = array();
        $utf8_offset = $offset = $end = 0;
        $len = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        // (without any term $re1 is '()' which matches the empty string, the
        // loop below then just collects leading chunks of the text)
        $terms = array_filter((array) $highlight);
        $re1 = '('.join('|',array_map('preg_quote_cb',$terms)).')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        for ($cnt=4; $cnt--;) {
            // prefer spots with three different terms close together, then
            // two, then any single term; stop when nothing matches anymore
            if (!preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset) &&
                !preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset) &&
                !preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) {
                break;
            }

            list($str,$idx) = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text,0,$idx));
            $utf8_len = utf8_strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx-$utf8_offset,100);
            $post = min($len-$utf8_idx-$utf8_len,100);

            if ($pre>50 && $post>50) {
                $pre = $post = 50;
            } else if ($pre>50) {
                $pre = min($pre,100-$post);
            } else if ($post>50) {
                $post = min($post, 100-$pre);
            } else {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = array($text);
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
            } else {
                $snippets[] = utf8_substr($text,$start,$end-$start);
            }

            // set $offset for next match attempt
            //   substract strlen to avoid splitting a potential search success,
            //   this is an approximation as the search pattern may match strings
            //   of varying length and it will fail if the context snippet
            //   boundary breaks a matching string longer than the current match
            $utf8_offset = $utf8_idx + $post;
            $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
            $offset = utf8_correctIdx($text,$offset);
        }

        if (empty($terms)) {
            // nothing to highlight: marking matches of the empty pattern would
            // wrap every gap between two characters in an empty <strong>
            $snippet = hsc(join('... ',$snippets));
        } else {
            // mark the hits with a control character first, so the markers
            // survive hsc() and can be turned into HTML afterwards
            $m = "\1";
            $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets);
            $snippet = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));
        }

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
379
/**
 * Intersects found documents and sums up their scores
 *
 * Implements the logical AND of searched words: a document is kept only
 * if it is a key in every given array, its score being the sum of its
 * scores in all of them.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array the documents present in all page arrays
 */
function ft_resultCombine($args){
    $total = count($args);
    if($total == 1){
        return $args[0];
    }

    $combined = array();
    if($total < 2){
        // no page arrays at all
        return $combined;
    }

    foreach($args[0] as $doc => $score){
        $in_all = true;
        for($n = 1; $n < $total; $n++){
            if(!isset($args[$n][$doc])){
                $in_all = false;
                break;
            }
            $score += $args[$n][$doc];
        }
        if($in_all) $combined[$doc] = $score;
    }
    return $combined;
}
411
/**
 * Unites found documents and sum up their scores
 *
 * Implements the logical OR: every document found in any of the given
 * arrays is returned, its score being the sum of its scores in all arrays
 * that contain it.
 *
 * based upon ft_resultCombine() function
 *
 * @param array $args An array of page arrays
 * @return array the documents of all page arrays
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultUnite($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    for ($i = 1; $i < $array_count; $i++) {
        foreach ($args[$i] as $id => $score) {
            if (isset($result[$id])) {
                $result[$id] += $score;
            } else {
                // first occurrence of this document: nothing to add to yet
                $result[$id] = $score;
            }
        }
    }
    return $result;
}
434
/**
 * Computes the difference of documents using page id for comparison
 *
 * Returns the documents of the first array that are not a key in any of
 * the other arrays; scores of the first array are kept as they are.
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array the documents found in the first page array only
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultComplement($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    foreach (array_keys($result) as $id) {
        for ($i = 1; $i < $array_count; $i++) {
            if (isset($args[$i][$id])) {
                unset($result[$id]);
                break; // already removed, no need to check the remaining arrays
            }
        }
    }
    return $result;
}
457
/**
 * Parses a search query and builds an array of search formulas
 *
 * The query is first rewritten into an intermediate infix string, then
 * converted into a postfix (Reverse Polish notation) token array and
 * finally split up into word, phrase, namespace and highlight lists.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param string $query the search query
 * @return array with the keys 'query' (the input), 'parsed_str' (infix
 *               representation), 'parsed_ary' (postfix token array) and the
 *               lists 'words', 'phrases', 'highlight', 'ns', 'notns', 'and'
 *               and 'not' (each an array, possibly empty)
 */
function ft_queryParser($query){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    // the query is lowercased here, so the uppercase operator names used in
    // the intermediate representation cannot collide with search terms
    $terms = preg_split('/(-?".*?")/u', utf8_strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')'  , ' ) ', $term);
            $term = str_replace('('  , ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
            $term = trim($term);
            if ($term === '') continue;

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:'.$matches[1].')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:'.$matches[1].')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($token, $stopwords);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    $parsed_query .= str_repeat(')', $parens_level);
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
    $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
    $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array();
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator: move operators of same or higher precedence to the
            // output first; end() yields false once the stack is empty
            $last_ope = end($ope_stack);
            while ($last_ope !== false && $last_ope != '(' && $ope_precedence[$token] <= $ope_precedence[$last_ope]) {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand: restore the parentheses ft_termParser() encoded as OP/CP
            $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        // the previous slot may already have been removed as part of a pair
        if ($parsed_ary[$i] === 'NOT' && isset($parsed_ary[$i - 1]) && $parsed_ary[$i - 1] === 'NOT') {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // skip operators; two-letter tokens like 'OR' have no third character
        if (!isset($token[2]) || $token[2] !== ':') continue;
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                     $q['ns'][]        = $body; // for backward compatibility
                     break;
            case 'N-:':
                     $q['notns'][]     = $body; // for backward compatibility
                     break;
            case 'W_:':
                     $q['words'][]     = $body;
                     break;
            case 'W-:':
                     $q['words'][]     = $body;
                     $q['not'][]       = $body; // for backward compatibility
                     break;
            case 'W+:':
                     $q['words'][]     = $body;
                     $q['highlight'][] = str_replace('*', '', $body);
                     $q['and'][]       = $body; // for backward compatibility
                     break;
            case 'P-:':
                     $q['phrases'][]   = $body;
                     break;
            case 'P+:':
                     $q['phrases'][]   = $body;
                     $q['highlight'][] = str_replace('*', '', $body);
                     break;
        }
    }
    // make sure every list exists and holds unique values only
    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}
690
/**
 * Transforms given search term into intermediate representation
 *
 * Helper of ft_queryParser(), not meant for general purpose use.
 *
 * With $consider_asian set, the term is split at runs of Asian characters
 * and each piece is parsed on its own; from the first Asian piece on, the
 * pieces are parsed in phrase mode. Otherwise the term is tokenized and
 * turned into a parenthesised group of W+:/W_:/P+: parts.
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param string $term           the search term
 * @param array  &$stopwords     stopword list handed on to idx_tokenizer()
 * @param bool   $consider_asian split at runs of Asian characters first
 * @param bool   $phrase_mode    a term of several words is searched as a phrase
 * @return string the intermediate representation of the term
 */
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $pieces = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        $joined = '';
        foreach ($pieces as $piece) {
            if (preg_match('/'.IDX_ASIAN.'/u', $piece)) {
                $phrase_mode = true;
            }
            $joined .= ft_termParser($piece, $stopwords, false, $phrase_mode);
        }
        return $joined;
    }

    // parentheses are no part of a word, tokenize without them
    $tokens = idx_tokenizer(str_replace(array('(', ')'), ' ', $term), $stopwords, true);

    if (empty($tokens)) {
        return '()'; // important: do not remove
    }
    if ($tokens[0] === $term) {
        // the term is one plain word
        return '(W+:'.$tokens[0].')';
    }
    if ($phrase_mode) {
        // W_: no need to highlight the single words, the phrase is highlighted;
        // parentheses are encoded so they are not taken for grouping
        $encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
        return '((W_:'.implode(')(W_:', $tokens).')(P+:'.$encoded.'))';
    }
    return '((W+:'.implode(')(W+:', $tokens).'))';
}
725
726//Setup VIM: ex: et ts=4 enc=utf-8 :
727