xref: /dokuwiki/inc/Search/Query/QueryParser.php (revision ede4646658cf51245060332d97a319a39c788ea1)
1<?php
2
3namespace dokuwiki\Search\Query;
4
5use dokuwiki\Search\Tokenizer;
6use dokuwiki\Utf8;
7
8/**
9 * DokuWuki QueryParser class
10 */
11class QueryParser
12{
13    /**
14     * Transforms given search term into intermediate representation
15     *
16     * This function is used in QueryParser::convert() and not for general purpose use.
17     *
18     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
19     *
20     * @param string       $term
21     * @param bool         $consider_asian
22     * @param bool         $phrase_mode
23     * @return string
24     */
25    public function termParser($term, $consider_asian = true, $phrase_mode = false)
26    {
27        $parsed = '';
28        if ($consider_asian) {
29            // successive asian characters need to be searched as a phrase
30            $words = Utf8\Asian::splitAsianWords($term);
31            foreach ($words as $word) {
32                $phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word);
33                $parsed .= $this->termParser($word, false, $phrase_mode);
34            }
35        } else {
36            $term_noparen = str_replace(['(',')'], ' ', $term);
37            $words = Tokenizer::getWords($term_noparen, true);
38
39            // W_: no need to highlight
40            if (empty($words)) {
41                $parsed = '()'; // important: do not remove
42            } elseif ($words[0] === $term) {
43                $parsed = '(W+:'.$words[0].')';
44            } elseif ($phrase_mode) {
45                $term_encoded = str_replace(['(',')'], ['OP','CP'], $term);
46                $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
47            } else {
48                $parsed = '((W+:'.implode(')(W+:', $words).'))';
49            }
50        }
51        return $parsed;
52    }
53
54    /**
55     * Parses a search query and builds an array of search formulas
56     *
57     * @author Andreas Gohr <andi@splitbrain.org>
58     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
59     *
60     * @param string $query search query
61     * @return array of search formulas
62     */
63    public function convert($query)
64    {
65        /**
66         * parse a search query and transform it into intermediate representation
67         *
68         * in a search query, you can use the following expressions:
69         *
70         *   words:
71         *     include
72         *     -exclude
73         *   phrases:
74         *     "phrase to be included"
75         *     -"phrase you want to exclude"
76         *   namespaces:
77         *     @include:namespace (or ns:include:namespace)
78         *     ^exclude:namespace (or -ns:exclude:namespace)
79         *   groups:
80         *     ()
81         *     -()
82         *   operators:
83         *     and ('and' is the default operator: you can always omit this)
84         *     or  (or pipe symbol '|', lower precedence than 'and')
85         *
86         * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
87         *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
88         *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
89         *      as long as you don't mind hit counts.
90         *
91         * intermediate representation consists of the following parts:
92         *
93         *   ( )           - group
94         *   AND           - logical and
95         *   OR            - logical or
96         *   NOT           - logical not
97         *   W+:, W-:, W_: - word      (underscore: no need to highlight)
98         *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
99         *   N+:, N-:      - namespace
100         */
101        $parsed_query = '';
102        $parens_level = 0;
103        $terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query),
104                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
105        );
106
107        foreach ($terms as $term) {
108            $parsed = '';
109            if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
110                // phrase-include and phrase-exclude
111                $not = $matches[1] ? 'NOT' : '';
112                $parsed = $not . $this->termParser($matches[2], false, true);
113            } else {
114                // fix incomplete phrase
115                $term = str_replace('"', ' ', $term);
116
117                // fix parentheses
118                $term = str_replace(')'  , ' ) ', $term);
119                $term = str_replace('('  , ' ( ', $term);
120                $term = str_replace('- (', ' -(', $term);
121
122                // treat pipe symbols as 'OR' operators
123                $term = str_replace('|', ' or ', $term);
124
125                // treat ideographic spaces (U+3000) as search term separators
126                // FIXME: some more separators?
127                $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
128                $term = trim($term);
129                if ($term === '') continue;
130
131                $tokens = explode(' ', $term);
132                foreach ($tokens as $token) {
133                    if ($token === '(') {
134                        // parenthesis-include-open
135                        $parsed .= '(';
136                        ++$parens_level;
137                    } elseif ($token === '-(') {
138                        // parenthesis-exclude-open
139                        $parsed .= 'NOT(';
140                        ++$parens_level;
141                    } elseif ($token === ')') {
142                        // parenthesis-any-close
143                        if ($parens_level === 0) continue;
144                        $parsed .= ')';
145                        $parens_level--;
146                    } elseif ($token === 'and') {
147                        // logical-and (do nothing)
148                    } elseif ($token === 'or') {
149                        // logical-or
150                        $parsed .= 'OR';
151                    } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
152                        // namespace-exclude
153                        $parsed .= 'NOT(N+:'.$matches[1].')';
154                    } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
155                        // namespace-include
156                        $parsed .= '(N+:'.$matches[1].')';
157                    } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
158                        // word-exclude
159                        $parsed .= 'NOT('.$this->termParser($matches[1]).')';
160                    } else {
161                        // word-include
162                        $parsed .= $this->termParser($token);
163                    }
164                }
165            }
166            $parsed_query .= $parsed;
167        }
168
169        // cleanup (very sensitive)
170        $parsed_query .= str_repeat(')', $parens_level);
171        do {
172            $parsed_query_old = $parsed_query;
173            $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
174        } while ($parsed_query !== $parsed_query_old);
175        $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
176        $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
177        $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
178        $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
179        $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);
180
181        // adjustment: make highlightings right
182        $parens_level     = 0;
183        $notgrp_levels    = array();
184        $parsed_query_new = '';
185        $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query,
186                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
187        );
188        foreach ($tokens as $token) {
189            if ($token === 'NOT(') {
190                $notgrp_levels[] = ++$parens_level;
191            } elseif ($token === '(') {
192                ++$parens_level;
193            } elseif ($token === ')') {
194                if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
195            } elseif (count($notgrp_levels) % 2 === 1) {
196                // turn highlight-flag off if terms are logically in "NOT" group
197                $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
198            }
199            $parsed_query_new .= $token;
200        }
201        $parsed_query = $parsed_query_new;
202
203        /**
204         * convert infix notation string into postfix (Reverse Polish notation) array
205         * by Shunting-yard algorithm
206         *
207         * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
208         * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
209         */
210        $parsed_ary     = array();
211        $ope_stack      = array();
212        $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
213        $ope_regex      = '/([()]|OR|AND|NOT)/u';
214
215        $tokens = preg_split($ope_regex, $parsed_query,
216                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
217        );
218        foreach ($tokens as $token) {
219            if (preg_match($ope_regex, $token)) {
220                // operator
221                $last_ope = end($ope_stack);
222                while ($last_ope !== false
223                    && $ope_precedence[$token] <= $ope_precedence[$last_ope]
224                    && $last_ope != '('
225                ) {
226                    $parsed_ary[] = array_pop($ope_stack);
227                    $last_ope = end($ope_stack);
228                }
229                if ($token == ')') {
230                    array_pop($ope_stack); // this array_pop always deletes '('
231                } else {
232                    $ope_stack[] = $token;
233                }
234            } else {
235                // operand
236                $token_decoded = str_replace(['OP','CP'], ['(',')'], $token);
237                $parsed_ary[] = $token_decoded;
238            }
239        }
240        $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));
241
242        // cleanup: each double "NOT" in RPN array actually does nothing
243        $parsed_ary_count = count($parsed_ary);
244        for ($i = 1; $i < $parsed_ary_count; ++$i) {
245            if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
246                unset($parsed_ary[$i], $parsed_ary[$i - 1]);
247            }
248        }
249        $parsed_ary = array_values($parsed_ary);
250
251        // build return value
252        $q = array();
253        $q['query']      = $query;
254        $q['parsed_str'] = $parsed_query;
255        $q['parsed_ary'] = $parsed_ary;
256
257        foreach ($q['parsed_ary'] as $token) {
258            if ($token[2] !== ':') continue;
259            $body = substr($token, 3);
260
261            switch (substr($token, 0, 3)) {
262                case 'N+:':
263                     $q['ns'][]        = $body; // for backward compatibility
264                     break;
265                case 'N-:':
266                     $q['notns'][]     = $body; // for backward compatibility
267                     break;
268                case 'W_:':
269                     $q['words'][]     = $body;
270                     break;
271                case 'W-:':
272                     $q['words'][]     = $body;
273                     $q['not'][]       = $body; // for backward compatibility
274                     break;
275                case 'W+:':
276                     $q['words'][]     = $body;
277                     $q['highlight'][] = $body;
278                     $q['and'][]       = $body; // for backward compatibility
279                     break;
280                case 'P-:':
281                     $q['phrases'][]   = $body;
282                     break;
283                case 'P+:':
284                     $q['phrases'][]   = $body;
285                     $q['highlight'][] = $body;
286                     break;
287            }
288        }
289        foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
290            $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
291        }
292
293        return $q;
294    }
295
296    /**
297     * Recreate a search query string based on parsed parts,
298     * doesn't support negated phrases and `OR` searches
299     *
300     * @param array $and
301     * @param array $not
302     * @param array $phrases
303     * @param array $ns
304     * @param array $notns
305     *
306     * @return string
307     */
308    public function revert(array $and, array $not, array $phrases, array $ns, array $notns)
309    {
310        $query = implode(' ', $and);
311
312        if (!empty($not)) {
313            $query .= ' -' . implode(' -', $not);
314        }
315        if (!empty($phrases)) {
316            $query .= ' "' . implode('" "', $phrases) . '"';
317        }
318        if (!empty($ns)) {
319            $query .= ' @' . implode(' @', $ns);
320        }
321        if (!empty($notns)) {
322            $query .= ' ^' . implode(' ^', $notns);
323        }
324        return $query;
325    }
326}
327