xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 173bfbcabe2c339f24be46ed77aa8727524bac7f)
1*173bfbcaSSatoshi Sahara<?php
2*173bfbcaSSatoshi Saharanamespace dokuwiki\Search;
3*173bfbcaSSatoshi Sahara
4*173bfbcaSSatoshi Saharause dokuwiki\Extension\Event;
5*173bfbcaSSatoshi Sahara
6*173bfbcaSSatoshi Sahara
7*173bfbcaSSatoshi Sahara/**
8*173bfbcaSSatoshi Sahara * Class DokuWiki Fulltext Search
9*173bfbcaSSatoshi Sahara *
10*173bfbcaSSatoshi Sahara * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
11*173bfbcaSSatoshi Sahara * @author     Andreas Gohr <andi@splitbrain.org>
12*173bfbcaSSatoshi Sahara */
13*173bfbcaSSatoshi Saharaclass FulltextSearch
14*173bfbcaSSatoshi Sahara{
/**
 * FulltextSearch constructor.
 *
 * Declared protected so that the class cannot be instantiated from outside;
 * the methods visible in this class are all static.
 */
protected function __construct()
{
    // intentionally empty
}
19*173bfbcaSSatoshi Sahara
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * Packs the arguments into an event data array and triggers the
 * SEARCH_QUERY_FULLPAGE event with callback_pageSearch() registered as
 * the default action.
 *
 * @triggers SEARCH_QUERY_FULLPAGE
 *
 * @param string     $query     the search query
 * @param array      $highlight bound by reference into the event data; the
 *                              default action overwrites it with the
 *                              highlight terms of the parsed query
 * @param string     $sort      result order, null is replaced by 'hits';
 *                              the default action sorts by modification time
 *                              for 'mtime' and by hit count otherwise
 * @param int|string $after     only show results with mtime after this date,
 *                              accepts timestamp or strtotime arguments
 * @param int|string $before    only show results with mtime before this date,
 *                              accepts timestamp or strtotime arguments
 *
 * @return array result of the triggered event
 */
public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
{
    $eventData = [
        'query'     => $query,
        'sort'      => ($sort === null) ? 'hits' : $sort,
        'after'     => $after,
        'before'    => $before,
        // keep the reference so handlers can report highlight terms to the caller
        'highlight' => &$highlight,
    ];
    $defaultAction = static::class . '::callback_pageSearch';

    return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $eventData, $defaultAction);
}
52*173bfbcaSSatoshi Sahara
/**
 * Returns a list of matching documents for the given query
 *
 * Default action of the SEARCH_QUERY_FULLPAGE event. The query is parsed
 * into a token list which is evaluated with a stack: word, phrase and
 * namespace tokens push a page list (page id => hit count), the AND, OR and
 * NOT tokens pop their operands and push the combined list. The page list
 * left on the stack is then filtered (hidden pages, ACL, existence, mtime
 * window) and sorted.
 *
 * @triggers FULLTEXT_PHRASE_MATCH
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param array $data event data; reads 'query', 'sort', 'after' and 'before',
 *                    writes the parsed highlight terms to 'highlight'
 * @return array matching documents
 */
public static function callback_pageSearch(&$data)
{
    $indexer = idx_get_indexer();

    // parse the given query and report the highlight terms through the event data
    $parsed = static::queryParser($indexer, $data['query']);
    $data['highlight'] = $parsed['highlight'];

    if (empty($parsed['parsed_ary'])) {
        return [];
    }

    // lookup all words found in the query
    $wordHits = $indexer->lookup($parsed['words']);

    // all pages known to the indexer (!: includes nonexistent pages), each with 0 hits
    $allPages = [];
    foreach ($indexer->getPages() as $pageId) {
        $allPages[$pageId] = 0;
    }

    // evaluate the token list
    $operands = [];
    foreach ($parsed['parsed_ary'] as $token) {
        $type = substr($token, 0, 3);

        if (in_array($type, ['W+:', 'W-:', 'W_:'], true)) {
            // word: push the pages found for it in the index
            $word = substr($token, 3);
            $operands[] = (array) $wordHits[$word];
        } elseif ($type === 'P+:' || $type === 'P-:') {
            // phrase: since phrases are always parsed as ((W1)(W2)...(P)),
            // the top of the stack holds the pages that contain all words
            // of this phrase; only those need the raw text check
            $phrase = substr($token, 3);
            $candidates = end($operands);
            $confirmed = [];
            foreach (array_keys($candidates) as $pageId) {
                $matchData = [
                    'id' => $pageId,
                    'phrase' => $phrase,
                    'text' => rawWiki($pageId),
                ];
                $matchEvent = new Event('FULLTEXT_PHRASE_MATCH', $matchData);
                if ($matchEvent->advise_before() && $matchEvent->result !== true) {
                    $haystack = \dokuwiki\Utf8\PhpString::strtolower($matchData['text']);
                    if (strpos($haystack, $phrase) !== false) {
                        $matchEvent->result = true;
                    }
                }
                $matchEvent->advise_after();
                if ($matchEvent->result === true) {
                    $confirmed[$pageId] = 0; // phrase: always 0 hit
                }
            }
            $operands[] = $confirmed;
        } elseif ($type === 'N+:' || $type === 'N-:') {
            // namespace: push every known page whose id starts with the namespace
            $nsPrefix = cleanID(substr($token, 3)) . ':';
            $inNamespace = [];
            foreach (array_keys($allPages) as $pageId) {
                if (strpos($pageId, $nsPrefix) === 0) {
                    $inNamespace[$pageId] = 0; // namespace: always 0 hit
                }
            }
            $operands[] = $inNamespace;
        } elseif ($type === 'AND') {
            // and operation (binary)
            list($left, $right) = array_splice($operands, -2);
            $operands[] = static::resultCombine([$left, $right]);
        } elseif ($type === 'OR') {
            // or operation (binary)
            list($left, $right) = array_splice($operands, -2);
            $operands[] = static::resultUnite([$left, $right]);
        } elseif ($type === 'NOT') {
            // not operation (unary), evaluated against all known pages
            $excluded = array_pop($operands);
            $operands[] = static::resultComplement([$allPages, $excluded]);
        }
    }
    $docs = array_pop($operands);

    if (empty($docs)) {
        return [];
    }

    // check: settings, acls, existence
    foreach (array_keys($docs) as $pageId) {
        if (!isHiddenPage($pageId)
            && auth_quickaclcheck($pageId) >= AUTH_READ
            && page_exists($pageId, '', false)
        ) {
            continue;
        }
        unset($docs[$pageId]);
    }

    $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

    if ($data['sort'] === 'mtime') {
        uksort($docs, static::class . '::pagemtimesorter');
    } else {
        // sort docs by count, highest first
        arsort($docs);
    }

    return $docs;
}
169*173bfbcaSSatoshi Sahara
/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter.
 * The third parameter allows to search in titles as well.
 *
 * The function always returns titles as well. The actual lookup is done by
 * callback_pageLookup(), which is registered as the default action of the
 * triggered event.
 *
 * @triggers SEARCH_QUERY_PAGELOOKUP
 * @author   Andreas Gohr <andi@splitbrain.org>
 * @author   Adrian Lang <lang@cosmocode.de>
 *
 * @param string     $id       page id
 * @param bool       $in_ns    match against namespace as well?
 * @param bool       $in_title search in title?
 * @param int|string $after    only show results with mtime after this date,
 *                             accepts timestamp or strtotime arguments
 * @param int|string $before   only show results with mtime before this date,
 *                             accepts timestamp or strtotime arguments
 *
 * @return string[]
 */
public static function pageLookup($id, $in_ns=false, $in_title=false, $after = null, $before = null)
{
    $eventData = [
        'id'         => $id,
        'in_ns'      => $in_ns,
        'in_title'   => $in_title,
        'after'      => $after,
        'before'     => $before,
        'has_titles' => true, // for plugin backward compatibility check
    ];
    $defaultAction = static::class . '::callback_pageLookup';

    return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $eventData, $defaultAction);
}
206*173bfbcaSSatoshi Sahara
/**
 * Returns list of pages as array(pageid => First Heading)
 *
 * Default action of the SEARCH_QUERY_PAGELOOKUP event. Page ids (and, when
 * requested, indexed titles) are matched against the search term, then the
 * result is narrowed to the namespace given in the query, to visible,
 * existing and readable pages and to the requested mtime window.
 *
 * @param array &$data event data; reads 'id', 'in_ns', 'in_title',
 *                     'after' and 'before'
 * @return string[]
 */
public static function callback_pageLookup(&$data)
{
    // the search term; a namespace restriction inside it is split off below
    $needle = $data['id'];
    $indexer = idx_get_indexer();
    $parsed = static::queryParser($indexer, $needle);

    $nsPrefix = null;
    if (count($parsed['ns']) > 0) {
        // only the first namespace of the query is used as restriction
        $nsPrefix = cleanID($parsed['ns'][0]) . ':';
        $needle = implode(' ', $parsed['highlight']);
    }

    $matchNamespace = $data['in_ns'];
    $matchTitle = $data['in_title'];
    $cleanNeedle = cleanID($needle);

    $indexer = idx_get_indexer();
    $knownPages = $indexer->getPages();

    $pages = [];
    if ($needle !== '' && $cleanNeedle !== '') {
        // match against the page id (with or without its namespace part)
        foreach ($knownPages as $pageId) {
            $subject = $matchNamespace ? $pageId : noNSorNS($pageId);
            if (strpos($subject, $cleanNeedle) === false) {
                continue;
            }
            if (isset($pages[$pageId])) {
                continue;
            }
            $pages[$pageId] = p_get_first_heading($pageId, METADATA_DONT_RENDER);
        }

        // match against the titles stored in the index
        if ($matchTitle) {
            $compare = static::class . '::pageLookupTitleCompare';
            foreach ($indexer->lookupKey('title', $needle, $compare) as $pageId) {
                if (isset($pages[$pageId])) {
                    continue;
                }
                $pages[$pageId] = p_get_first_heading($pageId, METADATA_DONT_RENDER);
            }
        }
    }

    // keep only pages inside the requested namespace
    if ($nsPrefix !== null) {
        foreach (array_keys($pages) as $pageId) {
            if (strpos($pageId, $nsPrefix) !== 0) {
                unset($pages[$pageId]);
            }
        }
    }

    // discard hidden pages, nonexistent pages and pages the user may not read
    foreach (array_keys($pages) as $pageId) {
        if (!isVisiblePage($pageId) || !page_exists($pageId) || auth_quickaclcheck($pageId) < AUTH_READ) {
            unset($pages[$pageId]);
        }
    }

    $pages = static::filterResultsByTime($pages, $data['after'], $data['before']);

    uksort($pages, static::class . '::pagesorter');
    return $pages;
}
272*173bfbcaSSatoshi Sahara
/**
 * Restrict search results to pages modified inside a time window
 *
 * Nothing is filtered when neither boundary is given. A boundary that is not
 * an integer is converted with strtotime(); if that conversion fails the
 * boundary is ignored.
 *
 * @param array      $results search results in the form pageid => value
 * @param int|string $after   only returns results with mtime after this date,
 *                            accepts timestamp or strtotime arguments
 * @param int|string $before  only returns results with mtime before this date,
 *                            accepts timestamp or strtotime arguments
 *
 * @return array the results that passed the filter, keys and values untouched
 */
protected static function filterResultsByTime(array $results, $after, $before)
{
    if (!$after && !$before) {
        return $results;
    }

    // An unset boundary must not reach strtotime(): passing null to it is
    // deprecated as of PHP 8.1. false disables the boundary, which is what
    // strtotime(null) used to yield.
    if ($after === null) {
        $after = false;
    } elseif (!is_int($after)) {
        $after = strtotime($after);
    }
    if ($before === null) {
        $before = false;
    } elseif (!is_int($before)) {
        $before = strtotime($before);
    }

    foreach ($results as $id => $value) {
        $mTime = filemtime(wikiFN($id));
        if ($after && $after > $mTime) {
            // modified before the lower boundary
            unset($results[$id]);
            continue;
        }
        if ($before && $before < $mTime) {
            // modified after the upper boundary
            unset($results[$id]);
        }
    }

    return $results;
}
302*173bfbcaSSatoshi Sahara
/**
 * Tiny helper function for comparing the searched title with the title
 * from the search index: a case-insensitive "contains" check with the
 * argument order expected by the index lookup callback.
 *
 * @param string $search searched title
 * @param string $title  title from index
 * @return bool true when $title contains $search, ignoring case
 */
public static function pageLookupTitleCompare($search, $title)
{
    // stripos() folds case byte-wise, which is only reliable for ASCII
    // letters; it is kept for pure ASCII search terms
    if (!preg_match('/[\x80-\xFF]/', $search)) {
        return stripos($title, $search) !== false;
    }

    // non-ASCII search term: lowercase both sides with the UTF-8 aware
    // helper, then a plain byte-wise substring check is sufficient
    $needle = \dokuwiki\Utf8\PhpString::strtolower($search);
    $haystack = \dokuwiki\Utf8\PhpString::strtolower($title);
    return strpos($haystack, $needle) !== false;
}
316*173bfbcaSSatoshi Sahara
/**
 * Comparison callback ordering page ids by namespace depth first and by
 * plain string comparison second, so that pages higher up in the hierarchy
 * come before pages that are nested deeper.
 *
 * @param string $a
 * @param string $b
 * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
 *             and 0 if they are equal.
 */
protected static function pagesorter($a, $b)
{
    // the number of ':' separators is the nesting depth of the id
    $depthA = substr_count($a, ':');
    $depthB = substr_count($b, ':');

    if ($depthA !== $depthB) {
        return ($depthA < $depthB) ? -1 : 1;
    }

    // same depth: fall back to byte-wise string order
    return strcmp($a, $b);
}
338*173bfbcaSSatoshi Sahara
/**
 * Comparison callback ordering page ids by the modification time of their
 * page file, newest first
 *
 * @param string $a
 * @param string $b
 *
 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
 *             and 0 if they are of the same age
 */
protected static function pagemtimesorter($a, $b)
{
    $modifiedA = filemtime(wikiFN($a));
    $modifiedB = filemtime(wikiFN($b));

    // a larger mtime means newer, so subtract A from B to get newest first
    return $modifiedB - $modifiedA;
}
354*173bfbcaSSatoshi Sahara
/**
 * Creates a snippet extract
 *
 * Looks for up to four places in the raw page text where the highlight
 * terms occur, cuts out each hit together with some surrounding context,
 * joins the pieces with '... ' and wraps the hits in
 * <strong class="search_hit"> after HTML escaping.
 *
 * Handlers of FULLTEXT_SNIPPET_CREATE may change the text and highlight
 * terms (both are passed by reference in the event data) or prevent the
 * default and provide their own 'snippet'.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string $id page id
 * @param array $highlight
 * @return mixed the 'snippet' entry of the event data
 */
public static function snippet($id, $highlight)
{
    $text = str_replace("\xC2\xAD", '', rawWiki($id)); // remove soft-hyphens
    $evdata = [
        'id'        => $id,
        'text'      => &$text,
        'highlight' => &$highlight,
        'snippet'   => '',
    ];

    $event = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($event->advise_before()) {
        $match = [];
        $snippets = [];
        $charOffset = 0; // where the previous hit ended, counted with the UTF-8 length helper
        $byteOffset = 0; // same position as byte offset, start of the next regexp search
        $prevEnd = 0;    // where the previous context ended
        $textLen = \dokuwiki\Utf8\PhpString::strlen($text);

        // build a regexp from the phrases to highlight
        $quoted = array_map('preg_quote_cb', array_filter((array) $highlight));
        $alternatives = array_map(static::class . '::snippet_re_preprocess', $quoted);
        $re1 = '(' . implode('|', $alternatives) . ')';
        // two respectively three different terms close to each other
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        for ($round = 0; $round < 4; $round++) {
            // prefer the pattern covering the most terms
            $found = false;
            foreach ([$re3, $re2, $re1] as $pattern) {
                if (preg_match('/' . $pattern . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $byteOffset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) {
                break;
            }

            list($hit, $hitByteIdx) = $match[0];

            // convert $hitByteIdx (a byte offset) into a utf8 character offset
            $hitCharIdx = \dokuwiki\Utf8\PhpString::strlen(substr($text, 0, $hitByteIdx));
            $hitCharLen = \dokuwiki\Utf8\PhpString::strlen($hit);

            // establish context, 100 units surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($hitCharIdx - $charOffset, 100);
            $post = min($textLen - $hitCharIdx - $hitCharLen, 100);

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($byteOffset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $hitCharIdx - $pre;
            $append = ($start < $prevEnd) ? $prevEnd : false; // still the end of the previous context snippet
            $prevEnd = $hitCharIdx + $hitCharLen + $post;      // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= \dokuwiki\Utf8\PhpString::substr(
                    $text,
                    $append,
                    $prevEnd - $append
                );
            } else {
                $snippets[] = \dokuwiki\Utf8\PhpString::substr($text, $start, $prevEnd - $start);
            }

            // set the offsets for the next match attempt:
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $charOffset = $hitCharIdx + $hitCharLen;
            $byteOffset = $hitByteIdx + strlen(\dokuwiki\Utf8\PhpString::substr($text, $hitCharIdx, $hitCharLen));
            $byteOffset = \dokuwiki\Utf8\Clean::correctIdx($text, $byteOffset);
        }

        // mark the hits with a control character first so that the markup
        // is inserted only after the text went through hsc()
        $marker = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $marker . '$1' . $marker, $snippets);
        $escaped = hsc(implode('... ', $snippets));

        $evdata['snippet'] = preg_replace(
            '/' . $marker . '([^' . $marker . ']*?)' . $marker . '/iu',
            '<strong class="search_hit">$1</strong>',
            $escaped
        );
    }
    $event->advise_after();
    unset($event);

    return $evdata['snippet'];
}
471*173bfbcaSSatoshi Sahara
/**
 * Wraps a search term in regex boundary checks.
 *
 * The term is expected to be regex-quoted already, so a wildcard shows up
 * as an escaped asterisk. A boundary check is added on each side that does
 * not carry such a wildcard; the wildcard itself is removed. A term that
 * consists of nothing but boundary checks afterwards becomes ''.
 *
 * @param string $term
 * @return string
 */
public static function snippet_re_preprocess($term)
{
    // do not process asian terms where word boundaries are not explicit
    if (\dokuwiki\Utf8\Asian::isAsianWords($term)) {
        return $term;
    }

    if (UTF8_PROPERTYSUPPORT) {
        // unicode word boundaries
        // see http://stackoverflow.com/a/2449017/172068
        $left = '(?<!\pL)';
        $right = '(?!\pL)';
    } else {
        // not as correct as above, but at least won't break
        $left = $right = '\b';
    }

    // left side: strip a leading wildcard or require a boundary
    $term = (substr($term, 0, 2) == '\\*') ? substr($term, 2) : $left . $term;

    // right side: strip a trailing wildcard or require a boundary
    $term = (substr($term, -2, 2) == '\\*') ? substr($term, 0, -2) : $term . $right;

    // nothing but boundary checks left
    if ($term == $left || $term == $right || $term == $left . $right) {
        return '';
    }
    return $term;
}
511*173bfbcaSSatoshi Sahara
/**
 * Combine found documents and sum up their scores
 *
 * Implements the logical AND of searched words: a key survives only when
 * it is set in every one of the given arrays, and its value is the sum of
 * the values from all of them.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array
 */
protected static function resultCombine($args)
{
    $total = count($args);

    // a single set needs no combining
    if ($total == 1) {
        return $args[0];
    }

    $combined = array();
    if ($total < 2) {
        // nothing was passed in at all
        return $combined;
    }

    foreach ($args[0] as $id => $score) {
        $inEverySet = true;
        for ($n = 1; $n < $total; ++$n) {
            if (!isset($args[$n][$id])) {
                // missing from one set: the document is dropped
                $inEverySet = false;
                break;
            }
            $score += $args[$n][$id];
        }
        if ($inEverySet) {
            $combined[$id] = $score;
        }
    }

    return $combined;
}
545*173bfbcaSSatoshi Sahara
/**
 * Unites found documents and sum up their scores
 * based upon resultCombine() method
 *
 * Every key that occurs in at least one of the given arrays is part of the
 * result; its value is the sum of the values from all arrays containing it.
 * An empty argument list yields an empty array.
 *
 * @param array $args An array of page arrays
 * @return array
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
protected static function resultUnite($args)
{
    // nothing to unite; also keeps the loop below from running unbounded
    if (empty($args)) {
        return array();
    }

    $array_count = count($args);
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    for ($i = 1; $i < $array_count; $i++) {
        foreach ($args[$i] as $id => $score) {
            // a key that is new to the result starts at 0 instead of
            // triggering an "undefined array key" warning on +=
            $result[$id] = (isset($result[$id]) ? $result[$id] : 0) + $score;
        }
    }
    return $result;
}
570*173bfbcaSSatoshi Sahara
/**
 * Computes the difference of documents using page id for comparison
 * nearly identical to PHP5's array_diff_key()
 *
 * Returns the entries of the first array whose key is not set in any of
 * the remaining arrays. An empty argument list yields an empty array.
 *
 * @param array $args An array of page arrays
 * @return array
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
protected static function resultComplement($args)
{
    // no sets at all: there is no first array to subtract from
    if (empty($args)) {
        return array();
    }

    $array_count = count($args);
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    foreach (array_keys($result) as $id) {
        for ($i = 1; $i < $array_count; $i++) {
            if (isset($args[$i][$id])) {
                unset($result[$id]);
                break; // already removed, no need to look at further sets
            }
        }
    }
    return $result;
}
595*173bfbcaSSatoshi Sahara
/**
 * Parses a search query and builds an array of search formulas
 *
 * Works in three stages: the query is rewritten into an intermediate infix
 * string, that string is converted into a postfix (Reverse Polish notation)
 * token list, and finally the individual terms are collected into flat lists.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param Doku_Indexer $Indexer
 * @param string $query search query
 * @return array of search formulas; holds the keys 'query', 'parsed_str',
 *               'parsed_ary', 'words', 'phrases', 'highlight', 'ns', 'notns',
 *               'and' and 'not'
 */
public static function queryParser($Indexer, $query)
{
    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    // split into quoted phrases (optionally negated) and everything in between
    $terms = preg_split('/(-?".*?")/u', \dokuwiki\Utf8\PhpString::strtolower($query),
                -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not . static::termParser($Indexer, $matches[2], false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')'  , ' ) ', $term);
            $term = str_replace('('  , ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
            $term = trim($term);
            if ($term === '') continue;

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close (unbalanced closers are dropped)
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:'.$matches[1].')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:'.$matches[1].')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT('.static::termParser($Indexer, $matches[1]).')';
                } else {
                    // word-include
                    $parsed .= static::termParser($Indexer, $token);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    // close groups that were left open, then strip empty groups until stable
    $parsed_query .= str_repeat(')', $parens_level);
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
    $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
    $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array();
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query,
                -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query,
                -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator
            $last_ope = end($ope_stack);
            while ($last_ope !== false
                && $ope_precedence[$token] <= $ope_precedence[$last_ope]
                && $last_ope != '('
            ) {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand: restore the parentheses that termParser() encoded
            $token_decoded = str_replace(['OP','CP'], ['(',')'], $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        // the previous slot may already have been removed as part of an
        // earlier pair, so make sure it still exists before comparing
        if ($parsed_ary[$i] === 'NOT'
            && isset($parsed_ary[$i - 1])
            && $parsed_ary[$i - 1] === 'NOT'
        ) {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // only "X?:body" terms are of interest here; operators such as 'OR'
        // are shorter than three characters and have no offset 2
        if (!isset($token[2]) || $token[2] !== ':') continue;
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                 $q['ns'][]        = $body; // for backward compatibility
                 break;
            case 'N-:':
                 $q['notns'][]     = $body; // for backward compatibility
                 break;
            case 'W_:':
                 $q['words'][]     = $body;
                 break;
            case 'W-:':
                 $q['words'][]     = $body;
                 $q['not'][]       = $body; // for backward compatibility
                 break;
            case 'W+:':
                 $q['words'][]     = $body;
                 $q['highlight'][] = $body;
                 $q['and'][]       = $body; // for backward compatibility
                 break;
            case 'P-:':
                 $q['phrases'][]   = $body;
                 break;
            case 'P+:':
                 $q['phrases'][]   = $body;
                 $q['highlight'][] = $body;
                 break;
        }
    }
    // make sure every list exists, and is free of duplicates and gaps
    foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}
838*173bfbcaSSatoshi Sahara
/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in queryParser() and not for general purpose use.
 *
 * With $consider_asian set, the term is first split by
 * Asian::splitAsianWords() and every piece is parsed on its own; phrase mode
 * is switched on as soon as a piece is reported as asian and stays on for
 * the remaining pieces. Otherwise the term is tokenized by the indexer and
 * rendered as W+:/W_:/P+: groups.
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param Doku_Indexer $Indexer
 * @param string       $term
 * @param bool         $consider_asian
 * @param bool         $phrase_mode
 * @return string
 */
public static function termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false)
{
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $pieces = array();
        foreach (\dokuwiki\Utf8\Asian::splitAsianWords($term) as $piece) {
            if ($phrase_mode) {
                $phrase_mode = true;
            } else {
                $phrase_mode = \dokuwiki\Utf8\Asian::isAsianWords($piece);
            }
            $pieces[] = static::termParser($Indexer, $piece, false, $phrase_mode);
        }
        return implode('', $pieces);
    }

    // parentheses would clash with the grouping syntax, so tokenize without them
    $tokens = $Indexer->tokenizer(str_replace(['(',')'], ' ', $term), true);

    if (empty($tokens)) {
        return '()'; // important: do not remove
    }

    if ($tokens[0] === $term) {
        // the term is a single plain word
        return '(W+:'.$tokens[0].')';
    }

    if ($phrase_mode) {
        // W_: no need to highlight, the phrase itself gets highlighted
        $encoded = str_replace(['(',')'], ['OP','CP'], $term);
        return '((W_:'.implode(')(W_:', $tokens).')(P+:'.$encoded.'))';
    }

    return '((W+:'.implode(')(W+:', $tokens).'))';
}
880*173bfbcaSSatoshi Sahara
/**
 * Recreate a search query string based on parsed parts,
 * doesn't support negated phrases and `OR` searches
 *
 * The parts are emitted in a fixed order: included words, excluded words
 * (prefixed with `-`), quoted phrases, included namespaces (`@`) and
 * excluded namespaces (`^`). Empty lists contribute nothing.
 *
 * @param array $and
 * @param array $not
 * @param array $phrases
 * @param array $ns
 * @param array $notns
 *
 * @return string
 */
public static function queryUnparser_simple(
                    array $and, array $not, array $phrases, array $ns, array $notns
) {
    $pieces = array(implode(' ', $and));

    if (count($not) > 0) {
        $pieces[] = ' -' . implode(' -', $not);
    }
    if (count($phrases) > 0) {
        $pieces[] = ' "' . implode('" "', $phrases) . '"';
    }
    if (count($ns) > 0) {
        $pieces[] = ' @' . implode(' @', $ns);
    }
    if (count($notns) > 0) {
        $pieces[] = ' ^' . implode(' ^', $notns);
    }

    return implode('', $pieces);
}
912*173bfbcaSSatoshi Sahara}
913