xref: /dokuwiki/inc/Search/Query/QueryParser.php (revision 9369b4a991666bc911474806b106d8958e79f4c1)
1<?php
2
3namespace dokuwiki\Search\Query;
4
5use dokuwiki\Search\Tokenizer;
6use dokuwiki\Utf8\Asian;
7use dokuwiki\Utf8\PhpString;
8
9/**
10 * DokuWuki QueryParser class
11 */
12class QueryParser
13{
14    /**
15     * Transforms given search term into intermediate representation
16     *
17     * This function is used in QueryParser::convert() and not for general purpose use.
18     *
19     * @param string $term
20     * @param bool $consider_asian
21     * @param bool $phrase_mode
22     * @return string
23     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
24     *
25     */
26    public function termParser(string $term, bool $consider_asian = true, bool $phrase_mode = false): string
27    {
28        $parsed = '';
29        if ($consider_asian) {
30            // successive asian characters need to be searched as a phrase
31            $words = Asian::splitAsianWords($term);
32            foreach ($words as $word) {
33                $phrase_mode = $phrase_mode || Asian::isAsianWords($word);
34                $parsed .= $this->termParser($word, false, $phrase_mode);
35            }
36        } else {
37            $term_noparen = str_replace(['(', ')'], ' ', $term);
38            $words = Tokenizer::getWords($term_noparen, true);
39
40            // W_: no need to highlight
41            if ($words === []) {
42                $parsed = '()'; // important: do not remove
43            } elseif ($words[0] === $term) {
44                $parsed = '(W+:' . $words[0] . ')';
45            } elseif ($phrase_mode) {
46                $term_encoded = str_replace(['(', ')'], ['OP', 'CP'], $term);
47                $parsed = '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))';
48            } else {
49                $parsed = '((W+:' . implode(')(W+:', $words) . '))';
50            }
51        }
52        return $parsed;
53    }
54
55    /**
56     * Parses a search query and builds an array of search formulas
57     *
58     * @param string $query search query
59     * @return array of search formulas
60     * @author Andreas Gohr <andi@splitbrain.org>
61     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
62     *
63     */
64    public function convert(string $query): array
65    {
66        /**
67         * parse a search query and transform it into intermediate representation
68         *
69         * in a search query, you can use the following expressions:
70         *
71         *   words:
72         *     include
73         *     -exclude
74         *   phrases:
75         *     "phrase to be included"
76         *     -"phrase you want to exclude"
77         *   namespaces:
78         * @include:namespace (or ns:include:namespace)
79         *     ^exclude:namespace (or -ns:exclude:namespace)
80         *   groups:
81         *     ()
82         *     -()
83         *   operators:
84         *     and ('and' is the default operator: you can always omit this)
85         *     or  (or pipe symbol '|', lower precedence than 'and')
86         *
87         * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
88         *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
89         *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
90         *      as long as you don't mind hit counts.
91         *
92         * intermediate representation consists of the following parts:
93         *
94         *   ( )           - group
95         *   AND           - logical and
96         *   OR            - logical or
97         *   NOT           - logical not
98         *   W+:, W-:, W_: - word      (underscore: no need to highlight)
99         *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
100         *   N+:, N-:      - namespace
101         */
102        $parsed_query = '';
103        $parens_level = 0;
104        $terms = preg_split(
105            '/(-?".*?")/u',
106            PhpString::strtolower($query),
107            -1,
108            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
109        );
110
111        foreach ($terms as $term) {
112            $parsed = '';
113            if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
114                // phrase-include and phrase-exclude
115                $not = $matches[1] ? 'NOT' : '';
116                $parsed = $not . $this->termParser($matches[2], false, true);
117            } else {
118                // fix incomplete phrase
119                $term = str_replace('"', ' ', $term);
120
121                // fix parentheses
122                $term = str_replace(')', ' ) ', $term);
123                $term = str_replace('(', ' ( ', $term);
124                $term = str_replace('- (', ' -(', $term);
125
126                // treat pipe symbols as 'OR' operators
127                $term = str_replace('|', ' or ', $term);
128
129                // treat ideographic spaces (U+3000) as search term separators
130                // FIXME: some more separators?
131                $term = preg_replace('/[ \x{3000}]+/u', ' ', $term);
132                $term = trim($term);
133                if ($term === '') continue;
134
135                $tokens = explode(' ', $term);
136                foreach ($tokens as $token) {
137                    if ($token === '(') {
138                        // parenthesis-include-open
139                        $parsed .= '(';
140                        ++$parens_level;
141                    } elseif ($token === '-(') {
142                        // parenthesis-exclude-open
143                        $parsed .= 'NOT(';
144                        ++$parens_level;
145                    } elseif ($token === ')') {
146                        // parenthesis-any-close
147                        if ($parens_level === 0) continue;
148                        $parsed .= ')';
149                        $parens_level--;
150                    } elseif ($token === 'and') {
151                        // logical-and (do nothing)
152                    } elseif ($token === 'or') {
153                        // logical-or
154                        $parsed .= 'OR';
155                    } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
156                        // namespace-exclude
157                        $parsed .= 'NOT(N+:' . $matches[1] . ')';
158                    } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
159                        // namespace-include
160                        $parsed .= '(N+:' . $matches[1] . ')';
161                    } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
162                        // word-exclude
163                        $parsed .= 'NOT(' . $this->termParser($matches[1]) . ')';
164                    } else {
165                        // word-include
166                        $parsed .= $this->termParser($token);
167                    }
168                }
169            }
170            $parsed_query .= $parsed;
171        }
172
173        // cleanup (very sensitive)
174        $parsed_query .= str_repeat(')', $parens_level);
175        do {
176            $parsed_query_old = $parsed_query;
177            $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
178        } while ($parsed_query !== $parsed_query_old);
179        $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')', $parsed_query);
180        $parsed_query = preg_replace('/(OR)+/u', 'OR', $parsed_query);
181        $parsed_query = preg_replace('/\(OR/u', '(', $parsed_query);
182        $parsed_query = preg_replace('/^OR|OR$/u', '', $parsed_query);
183        $parsed_query = preg_replace('/\)(NOT)?\(/u', ')AND$1(', $parsed_query);
184
185        // adjustment: make highlightings right
186        $parens_level = 0;
187        $notgrp_levels = [];
188        $parsed_query_new = '';
189        $tokens = preg_split(
190            '/(NOT\(|[()])/u',
191            $parsed_query,
192            -1,
193            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
194        );
195        foreach ($tokens as $token) {
196            if ($token === 'NOT(') {
197                $notgrp_levels[] = ++$parens_level;
198            } elseif ($token === '(') {
199                ++$parens_level;
200            } elseif ($token === ')') {
201                if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
202            } elseif (count($notgrp_levels) % 2 === 1) {
203                // turn highlight-flag off if terms are logically in "NOT" group
204                $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
205            }
206            $parsed_query_new .= $token;
207        }
208        $parsed_query = $parsed_query_new;
209
210        /**
211         * convert infix notation string into postfix (Reverse Polish notation) array
212         * by Shunting-yard algorithm
213         *
214         * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
215         * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
216         */
217        $parsed_ary = [];
218        $ope_stack = [];
219        $ope_precedence = [')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5];
220        $ope_regex = '/([()]|OR|AND|NOT)/u';
221
222        $tokens = preg_split(
223            $ope_regex,
224            $parsed_query,
225            -1,
226            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
227        );
228        foreach ($tokens as $token) {
229            if (preg_match($ope_regex, $token)) {
230                // operator
231                $last_ope = end($ope_stack);
232                while (
233                    $last_ope !== false
234                    && $ope_precedence[$token] <= $ope_precedence[$last_ope]
235                    && $last_ope != '('
236                ) {
237                    $parsed_ary[] = array_pop($ope_stack);
238                    $last_ope = end($ope_stack);
239                }
240                if ($token == ')') {
241                    array_pop($ope_stack); // this array_pop always deletes '('
242                } else {
243                    $ope_stack[] = $token;
244                }
245            } else {
246                // operand
247                $token_decoded = str_replace(['OP', 'CP'], ['(', ')'], $token);
248                $parsed_ary[] = $token_decoded;
249            }
250        }
251        $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));
252
253        // cleanup: each double "NOT" in RPN array actually does nothing
254        $parsed_ary_count = count($parsed_ary);
255        for ($i = 1; $i < $parsed_ary_count; ++$i) {
256            if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
257                unset($parsed_ary[$i], $parsed_ary[$i - 1]);
258            }
259        }
260        $parsed_ary = array_values($parsed_ary);
261
262        // build return value
263        $q = [];
264        $q['query'] = $query;
265        $q['parsed_str'] = $parsed_query;
266        $q['parsed_ary'] = $parsed_ary;
267
268        foreach ($q['parsed_ary'] as $token) {
269            if ($token[2] !== ':') continue;
270            $body = substr($token, 3);
271
272            switch (substr($token, 0, 3)) {
273                case 'N+:':
274                    $q['ns'][] = $body; // for backward compatibility
275                    break;
276                case 'N-:':
277                    $q['notns'][] = $body; // for backward compatibility
278                    break;
279                case 'W_:':
280                    $q['words'][] = $body;
281                    break;
282                case 'W-:':
283                    $q['words'][] = $body;
284                    $q['not'][] = $body; // for backward compatibility
285                    break;
286                case 'W+:':
287                    $q['words'][] = $body;
288                    $q['highlight'][] = $body;
289                    $q['and'][] = $body; // for backward compatibility
290                    break;
291                case 'P-:':
292                    $q['phrases'][] = $body;
293                    break;
294                case 'P+:':
295                    $q['phrases'][] = $body;
296                    $q['highlight'][] = $body;
297                    break;
298            }
299        }
300        foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
301            $q[$key] = empty($q[$key]) ? [] : array_values(array_unique($q[$key]));
302        }
303
304        return $q;
305    }
306
307    /**
308     * Recreate a search query string based on parsed parts,
309     * doesn't support negated phrases and `OR` searches
310     *
311     * @param array $and
312     * @param array $not
313     * @param array $phrases
314     * @param array $ns
315     * @param array $notns
316     *
317     * @return string
318     */
319    public function revert(array $and, array $not, array $phrases, array $ns, array $notns)
320    {
321        $query = implode(' ', $and);
322
323        if ($not !== []) {
324            $query .= ' -' . implode(' -', $not);
325        }
326        if ($phrases !== []) {
327            $query .= ' "' . implode('" "', $phrases) . '"';
328        }
329        if ($ns !== []) {
330            $query .= ' @' . implode(' @', $ns);
331        }
332        if ($notns !== []) {
333            $query .= ' ^' . implode(' ^', $notns);
334        }
335        return $query;
336    }
337}
338