xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 3837ea917fcd3abddc414e549ba7ddfcb7ca8a21)
1173bfbcaSSatoshi Sahara<?php
2173bfbcaSSatoshi Saharanamespace dokuwiki\Search;
3173bfbcaSSatoshi Sahara
4173bfbcaSSatoshi Saharause dokuwiki\Extension\Event;
5c31af4f3SSatoshi Saharause dokuwiki\Search\Indexer;
6*3837ea91SSatoshi Saharause dokuwiki\Search\QueryParser;
7c31af4f3SSatoshi Saharause dokuwiki\Utf8;
8173bfbcaSSatoshi Sahara
9173bfbcaSSatoshi Sahara/**
10173bfbcaSSatoshi Sahara * Class DokuWiki Fulltext Search
11173bfbcaSSatoshi Sahara *
12173bfbcaSSatoshi Sahara * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
13173bfbcaSSatoshi Sahara * @author     Andreas Gohr <andi@splitbrain.org>
14173bfbcaSSatoshi Sahara */
15173bfbcaSSatoshi Saharaclass FulltextSearch
16173bfbcaSSatoshi Sahara{
/**
 * Fulltext Search constructor.
 *
 * Declared protected so the class cannot be instantiated from the outside;
 * the public API visible in this class consists of static methods.
 */
protected function __construct()
{
}
21173bfbcaSSatoshi Sahara
/**
 * The fulltext search
 *
 * Collects the arguments into an event data array and triggers the
 * SEARCH_QUERY_FULLPAGE event with callback_pageSearch() as default action.
 *
 * @triggers SEARCH_QUERY_FULLPAGE
 *
 * @param string     $query     the search query
 * @param array      $highlight bound by reference into the event data, so the
 *                              value set by the default action reaches the caller
 * @param string     $sort      sort mode; null falls back to 'hits'
 * @param int|string $after     only show results with mtime after this date,
 *                              accepts timestamp or strtotime arguments
 * @param int|string $before    only show results with mtime before this date,
 *                              accepts timestamp or strtotime arguments
 *
 * @return array result of the triggered event
 */
public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
{
    $data = [
        'query' => $query,
        'sort' => ($sort === null) ? 'hits' : $sort,
        'after' => $after,
        'before' => $before,
    ];
    // reference binding: whatever the event writes to 'highlight' ends up in $highlight
    $data['highlight'] =& $highlight;

    $action = static::class . '::callback_pageSearch';

    return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
}
54173bfbcaSSatoshi Sahara
/**
 * Default action of SEARCH_QUERY_FULLPAGE: evaluate a parsed query
 *
 * The parsed query is a token list that is evaluated with a stack of
 * page sets (page id => hit count). Word, phrase and namespace tokens push
 * a set, AND/OR/NOT tokens pop their operands and push the combined set.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @triggers FULLTEXT_PHRASE_MATCH
 *
 * @param array $data event data; 'query', 'sort', 'after' and 'before' are read,
 *                    'highlight' is overwritten with the parser's highlight list
 * @return array matching documents as page id => hit count
 */
public static function callback_pageSearch(&$data)
{
    $Indexer = Indexer::getInstance();

    // let the parser turn the query string into tokens
    $parsed = QueryParser::convert($data['query']);
    $data['highlight'] = $parsed['highlight'];

    if (empty($parsed['parsed_ary'])) {
        return [];
    }

    // index lookup for every word that occurs in the query
    $wordHits = $Indexer->lookup($parsed['words']);

    // every page known to the index (may include nonexistent pages), each with 0 hits
    $allPages = [];
    foreach ($Indexer->getPages() as $pageId) {
        $allPages[$pageId] = 0;
    }

    // evaluate the token list
    $stack = [];
    foreach ($parsed['parsed_ary'] as $token) {
        $type = substr($token, 0, 3);

        if ($type === 'W+:' || $type === 'W-:' || $type === 'W_:') {
            // word: push the pages found by the index lookup
            $stack[] = (array) $wordHits[substr($token, 3)];
        } elseif ($type === 'P+:' || $type === 'P-:') {
            // phrase: phrases are always parsed as ((W1)(W2)...(P)), so the
            // top of the stack holds the pages containing all of its words
            $phrase = substr($token, 3);
            $candidates = end($stack);
            $withPhrase = [];
            foreach (array_keys($candidates) as $pageId) {
                $evdata = [
                    'id' => $pageId,
                    'phrase' => $phrase,
                    'text' => rawWiki($pageId),
                ];
                $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                if ($evt->advise_before() && $evt->result !== true) {
                    $lowered = Utf8\PhpString::strtolower($evdata['text']);
                    if (strpos($lowered, $phrase) !== false) {
                        $evt->result = true;
                    }
                }
                $evt->advise_after();
                if ($evt->result === true) {
                    $withPhrase[$pageId] = 0; // phrase: always 0 hit
                }
            }
            $stack[] = $withPhrase;
        } elseif ($type === 'N+:' || $type === 'N-:') {
            // namespace: push all known pages whose id starts with the namespace
            $prefix = cleanID(substr($token, 3)) . ':';
            $inNamespace = [];
            foreach (array_keys($allPages) as $pageId) {
                if (strpos($pageId, $prefix) === 0) {
                    $inNamespace[$pageId] = 0; // namespace: always 0 hit
                }
            }
            $stack[] = $inNamespace;
        } elseif ($type === 'AND') {
            // binary and: replace the two topmost sets by their intersection
            list($left, $right) = array_splice($stack, -2);
            $stack[] = static::resultCombine([$left, $right]);
        } elseif ($type === 'OR') {
            // binary or: replace the two topmost sets by their union
            list($left, $right) = array_splice($stack, -2);
            $stack[] = static::resultUnite([$left, $right]);
        } elseif ($type === 'NOT') {
            // unary not: replace the topmost set by its complement
            $operand = array_pop($stack);
            $stack[] = static::resultComplement([$allPages, $operand]);
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) {
        return [];
    }

    // keep only pages that are not hidden, readable and existing
    foreach (array_keys($docs) as $pageId) {
        $keep = !isHiddenPage($pageId)
            && auth_quickaclcheck($pageId) >= AUTH_READ
            && page_exists($pageId, '', false);
        if (!$keep) {
            unset($docs[$pageId]);
        }
    }

    $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

    if ($data['sort'] === 'mtime') {
        uksort($docs, static::class . '::pagemtimesorter');
    } else {
        // highest hit count first
        arsort($docs);
    }

    return $docs;
}
171173bfbcaSSatoshi Sahara
/**
 * Quicksearch for pagenames
 *
 * By default only the pagename is matched and the namespace is ignored;
 * the second parameter changes that. The third parameter additionally
 * searches in titles.
 *
 * The arguments are wrapped into an event data array (with 'has_titles'
 * set to true) and handed to the SEARCH_QUERY_PAGELOOKUP event, whose
 * default action is callback_pageLookup().
 *
 * @triggers SEARCH_QUERY_PAGELOOKUP
 * @author   Andreas Gohr <andi@splitbrain.org>
 * @author   Adrian Lang <lang@cosmocode.de>
 *
 * @param string     $id       page id
 * @param bool       $in_ns    match against namespace as well?
 * @param bool       $in_title search in title?
 * @param int|string $after    only show results with mtime after this date,
 *                             accepts timestamp or strtotime arguments
 * @param int|string $before   only show results with mtime before this date,
 *                             accepts timestamp or strtotime arguments
 *
 * @return string[]
 */
public static function pageLookup($id, $in_ns = false, $in_title = false, $after = null, $before = null)
{
    $data = [
        'id' => $id,
        'in_ns' => $in_ns,
        'in_title' => $in_title,
        'after' => $after,
        'before' => $before,
        'has_titles' => true, // for plugin backward compatibility check
    ];

    return Event::createAndTrigger(
        'SEARCH_QUERY_PAGELOOKUP',
        $data,
        static::class . '::callback_pageLookup'
    );
}
208173bfbcaSSatoshi Sahara
/**
 * Default action of SEARCH_QUERY_PAGELOOKUP
 *
 * Matches the cleaned search string against the page ids known to the
 * index (and, if requested, against the indexed titles), then narrows the
 * result by namespace, visibility, existence, read permission and mtime.
 *
 * @param array &$data event data; reads 'id', 'in_ns', 'in_title', 'after', 'before'
 * @return string[] list of pages as array(pageid => First Heading)
 */
public static function callback_pageLookup(&$data)
{
    $Indexer = Indexer::getInstance();

    $id = $data['id'];
    $in_ns = $data['in_ns'];
    $in_title = $data['in_title'];

    // a namespace restriction in the query is split off; the remaining
    // terms become the string that is searched for
    $ns = null;
    $parsedQuery = QueryParser::convert($id);
    if (count($parsedQuery['ns']) > 0) {
        $ns = cleanID($parsedQuery['ns'][0]) . ':';
        $id = implode(' ', $parsedQuery['highlight']);
    }

    $cleaned = cleanID($id);

    $pages = [];
    if ($id !== '' && $cleaned !== '') {
        // match against page ids (with or without their namespace part)
        foreach ($Indexer->getPages() as $p_id) {
            $haystack = $in_ns ? $p_id : noNSorNS($p_id);
            if (strpos($haystack, $cleaned) === false) {
                continue;
            }
            if (isset($pages[$p_id])) {
                continue;
            }
            $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
        }

        // match against the indexed titles
        if ($in_title) {
            $func = static::class . '::pageLookupTitleCompare';
            foreach ($Indexer->lookupKey('title', $id, $func) as $p_id) {
                if (isset($pages[$p_id])) {
                    continue;
                }
                $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
            }
        }
    }

    // enforce the namespace restriction, if there was one
    if ($ns !== null) {
        foreach (array_keys($pages) as $p_id) {
            if (strpos($p_id, $ns) !== 0) {
                unset($pages[$p_id]);
            }
        }
    }

    // keep only visible, existing pages the user may read
    foreach (array_keys($pages) as $idx) {
        $keep = isVisiblePage($idx)
            && page_exists($idx)
            && auth_quickaclcheck($idx) >= AUTH_READ;
        if (!$keep) {
            unset($pages[$idx]);
        }
    }

    $pages = static::filterResultsByTime($pages, $data['after'], $data['before']);

    uksort($pages, static::class . '::pagesorter');

    return $pages;
}
274173bfbcaSSatoshi Sahara
/**
 * Drop results whose page file was modified outside the given time window
 *
 * Without any limit the results are returned untouched and no file is
 * inspected. A limit that is not given (null) is left alone instead of being
 * handed to strtotime(), which expects a string and raises a deprecation
 * notice for null on PHP 8.1+.
 *
 * @param array           $results search results in the form pageid => value
 * @param int|string|null $after   only returns results with mtime after this date,
 *                                 accepts timestamp or strtotime arguments
 * @param int|string|null $before  only returns results with mtime before this date,
 *                                 accepts timestamp or strtotime arguments
 *
 * @return array the remaining results, keys and values preserved
 */
protected static function filterResultsByTime(array $results, $after, $before)
{
    if (!$after && !$before) {
        return $results;
    }

    // integers are taken as timestamps, everything else is parsed by strtotime();
    // an unparsable string yields false, which disables that limit below
    if ($after !== null && !is_int($after)) {
        $after = strtotime($after);
    }
    if ($before !== null && !is_int($before)) {
        $before = strtotime($before);
    }

    foreach (array_keys($results) as $id) {
        $mTime = filemtime(wikiFN($id));
        $tooOld = $after && $after > $mTime;
        $tooNew = $before && $before < $mTime;
        if ($tooOld || $tooNew) {
            unset($results[$id]);
        }
    }

    return $results;
}
304173bfbcaSSatoshi Sahara
/**
 * Comparison callback telling whether an indexed title contains the
 * searched string. Thin wrapper around stripos() with swapped argument
 * order and a boolean result.
 *
 * @param string $search searched title
 * @param string $title  title from index
 * @return bool true when $search occurs in $title according to stripos()
 */
protected static function pageLookupTitleCompare($search, $title)
{
    // stripos() yields an integer position on a hit and false otherwise
    return is_int(stripos($title, $search));
}
318173bfbcaSSatoshi Sahara
/**
 * Comparison callback ordering page ids by namespace depth first and by
 * plain string comparison second, so that pages higher up in the
 * hierarchy come before deeper nested ones.
 *
 * @param string $a
 * @param string $b
 * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
 *             and 0 if they are equal.
 */
protected static function pagesorter($a, $b)
{
    // the number of colons differs between two ids exactly when their
    // number of colon separated parts differs
    $depthDiff = substr_count($a, ':') - substr_count($b, ':');
    if ($depthDiff !== 0) {
        return ($depthDiff < 0) ? -1 : 1;
    }

    return strcmp($a, $b);
}
340173bfbcaSSatoshi Sahara
/**
 * Comparison callback ordering page ids by the modification time of
 * their page file, newest first.
 *
 * @param string $a
 * @param string $b
 *
 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
 *             and 0 if they are of the same age
 */
protected static function pagemtimesorter($a, $b)
{
    $modifiedA = filemtime(wikiFN($a));

    return filemtime(wikiFN($b)) - $modifiedA;
}
356173bfbcaSSatoshi Sahara
/**
 * Creates a snippet extract
 *
 * Looks for up to four places in the raw page text where the highlight
 * terms occur (preferring spots with three, then two different terms close
 * together), cuts out each hit with some surrounding context and wraps the
 * terms in <strong class="search_hit">. Plugins can take over via the
 * FULLTEXT_SNIPPET_CREATE event; text and highlight are passed by reference.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string $id page id
 * @param array $highlight terms to look for and to mark
 * @return mixed the 'snippet' entry of the event data
 */
public static function snippet($id, $highlight)
{
    $text = str_replace("\xC2\xAD", '', rawWiki($id)); // remove soft-hyphens
    $evdata = [
        'id'        => $id,
        'text'      => &$text,
        'highlight' => &$highlight,
        'snippet'   => '',
    ];

    $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($evt->advise_before()) {
        $match = [];
        $snippets = [];
        $charOffset = 0; // character position right after the previous hit
        $byteOffset = 0; // byte position where the next regexp search starts
        $prevEnd = 0;    // character position where the previous context ended
        $textLen = Utf8\PhpString::strlen($text);

        // build a regexp from the phrases to highlight
        $quoted = array_map('preg_quote_cb', array_filter((array) $highlight));
        $terms = array_map(static::class . '::snippet_re_preprocess', $quoted);
        $re1 = '(' . implode('|', $terms) . ')';
        // two respectively three different terms within a short distance
        $re2 = $re1 . '.{0,75}(?!\\1)' . $re1;
        $re3 = $re1 . '.{0,45}(?!\\1)' . $re1 . '.{0,45}(?!\\1)(?!\\2)' . $re1;

        for ($round = 0; $round < 4; $round++) {
            // try the most specific pattern first
            $found = false;
            foreach ([$re3, $re2, $re1] as $pattern) {
                if (preg_match('/' . $pattern . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $byteOffset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) {
                break;
            }

            list($hit, $byteIdx) = $match[0];

            // convert the byte offset of the hit into a utf8 character offset
            $charIdx = Utf8\PhpString::strlen(substr($text, 0, $byteIdx));
            $charLen = Utf8\PhpString::strlen($hit);

            // establish context, up to 100 characters surrounding the hit:
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50
            $pre = min($charIdx - $charOffset, 100);
            $post = min($textLen - $charIdx - $charLen, 100);

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($byteOffset == 0) {
                // both sides are short on the very first search: the context is
                // the whole text, no need for the snippet calculations below
                $snippets = [$text];
                break;
            }

            // context boundaries; continue the previous snippet if they overlap
            $start = $charIdx - $pre;
            $append = ($start < $prevEnd) ? $prevEnd : false; // end of the previous context
            $prevEnd = $charIdx + $charLen + $post;           // end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $prevEnd - $append);
            } else {
                $snippets[] = Utf8\PhpString::substr($text, $start, $prevEnd - $start);
            }

            // continue searching right behind the current hit; anything that
            // would match inside the trailing context is already part of it
            $charOffset = $charIdx + $charLen;
            $byteOffset = $byteIdx + strlen(Utf8\PhpString::substr($text, $charIdx, $charLen));
            $byteOffset = Utf8\Clean::correctIdx($text, $byteOffset);
        }

        // mark the terms with a control character first, escape the text,
        // then turn the marked spans into the highlighting markup
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        $snippet = preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(implode('... ', $snippets))
        );

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
473173bfbcaSSatoshi Sahara
/**
 * Wraps a search term in regex boundary checks.
 *
 * The term is expected to be regex-quoted already: a quoted asterisk
 * (backslash + '*') at either end is removed and suppresses the boundary
 * check on that side. A term that consists of nothing but boundary checks
 * afterwards is replaced by an empty string.
 *
 * @param string $term
 * @return string
 */
public static function snippet_re_preprocess($term)
{
    // do not process asian terms where word boundaries are not explicit
    if (Utf8\Asian::isAsianWords($term)) {
        return $term;
    }

    if (UTF8_PROPERTYSUPPORT) {
        // unicode word boundaries
        // see http://stackoverflow.com/a/2449017/172068
        $BL = '(?<!\pL)';
        $BR = '(?!\pL)';
    } else {
        // not as correct as above, but at least won't break
        $BL = '\b';
        $BR = '\b';
    }

    // left side: drop a leading quoted asterisk, otherwise require a boundary
    $term = (substr($term, 0, 2) == '\\*') ? substr($term, 2) : $BL . $term;

    // right side: same for a trailing quoted asterisk
    $term = (substr($term, -2, 2) == '\\*') ? substr($term, 0, -2) : $term . $BR;

    // nothing but boundary checks left
    return in_array($term, [$BL, $BR, $BL . $BR], true) ? '' : $term;
}
513173bfbcaSSatoshi Sahara
/**
 * Combine found documents and sum up their scores
 *
 * Implements the logical AND of searched words: a document is kept only
 * if it is present in every given array, and its score is the sum of its
 * scores in all of them. The documents keep the order of the first array.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array
 */
protected static function resultCombine($args)
{
    $total = count($args);
    if ($total == 1) {
        return $args[0];
    }

    $combined = [];
    if ($total < 2) {
        return $combined;
    }

    foreach ($args[0] as $docId => $score) {
        $inAll = true;
        for ($n = 1; $n < $total; $n++) {
            if (!isset($args[$n][$docId])) {
                $inAll = false;
                break;
            }
            $score += $args[$n][$docId];
        }
        if ($inAll) {
            $combined[$docId] = $score;
        }
    }

    return $combined;
}
547173bfbcaSSatoshi Sahara
548173bfbcaSSatoshi Sahara    /**
549173bfbcaSSatoshi Sahara     * Unites found documents and sum up their scores
550173bfbcaSSatoshi Sahara     * based upon resultCombine() method
551173bfbcaSSatoshi Sahara     *
552173bfbcaSSatoshi Sahara     * @param array $args An array of page arrays
553173bfbcaSSatoshi Sahara     * @return array
554173bfbcaSSatoshi Sahara     *
555173bfbcaSSatoshi Sahara     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
556173bfbcaSSatoshi Sahara     */
557173bfbcaSSatoshi Sahara    protected static function resultUnite($args)
558173bfbcaSSatoshi Sahara    {
559173bfbcaSSatoshi Sahara        $array_count = count($args);
560173bfbcaSSatoshi Sahara        if ($array_count === 1) {
561173bfbcaSSatoshi Sahara            return $args[0];
562173bfbcaSSatoshi Sahara        }
563173bfbcaSSatoshi Sahara
564173bfbcaSSatoshi Sahara        $result = $args[0];
565173bfbcaSSatoshi Sahara        for ($i = 1; $i !== $array_count; $i++) {
566173bfbcaSSatoshi Sahara            foreach (array_keys($args[$i]) as $id) {
567173bfbcaSSatoshi Sahara                $result[$id] += $args[$i][$id];
568173bfbcaSSatoshi Sahara            }
569173bfbcaSSatoshi Sahara        }
570173bfbcaSSatoshi Sahara        return $result;
571173bfbcaSSatoshi Sahara    }
572173bfbcaSSatoshi Sahara
573173bfbcaSSatoshi Sahara    /**
574173bfbcaSSatoshi Sahara     * Computes the difference of documents using page id for comparison
575173bfbcaSSatoshi Sahara     * nearly identical to PHP5's array_diff_key()
576173bfbcaSSatoshi Sahara     *
577173bfbcaSSatoshi Sahara     * @param array $args An array of page arrays
578173bfbcaSSatoshi Sahara     * @return array
579173bfbcaSSatoshi Sahara     *
580173bfbcaSSatoshi Sahara     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
581173bfbcaSSatoshi Sahara     */
582173bfbcaSSatoshi Sahara    protected static function resultComplement($args)
583173bfbcaSSatoshi Sahara    {
584173bfbcaSSatoshi Sahara        $array_count = count($args);
585173bfbcaSSatoshi Sahara        if ($array_count === 1) {
586173bfbcaSSatoshi Sahara            return $args[0];
587173bfbcaSSatoshi Sahara        }
588173bfbcaSSatoshi Sahara
589173bfbcaSSatoshi Sahara        $result = $args[0];
590173bfbcaSSatoshi Sahara        foreach (array_keys($result) as $id) {
591173bfbcaSSatoshi Sahara            for ($i = 1; $i !== $array_count; $i++) {
592173bfbcaSSatoshi Sahara                if (isset($args[$i][$id])) unset($result[$id]);
593173bfbcaSSatoshi Sahara            }
594173bfbcaSSatoshi Sahara        }
595173bfbcaSSatoshi Sahara        return $result;
596173bfbcaSSatoshi Sahara    }
597173bfbcaSSatoshi Sahara}
598