xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 9329b002986cc3f43c18c207dd9d0fdfd0f8a5e8)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\FulltextIndex;
7use dokuwiki\Search\QueryParser;
8use dokuwiki\Utf8;
9
10// create snippets for the first few results only
11const FT_SNIPPET_NUMBER = 15;
12
13/**
14 * Class DokuWiki Fulltext Search
15 *
16 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
17 * @author     Andreas Gohr <andi@splitbrain.org>
18 */
19class FulltextSearch
20{
21    /** @var FulltextSearch $instance */
22    protected static $instance = null;
23
24    /**
25     * Get new or existing singleton instance of the FulltextSearch
26     *
27     * @return FulltextSearch
28     */
29    public static function getInstance()
30    {
31        if (is_null(static::$instance)) {
32            static::$instance = new static();
33        }
34        return static::$instance;
35    }
36
37    /**
38     *  Fulltext Search constructor. prevent direct object creation
39     */
40    protected function __construct() {}
41
42    /**
43     * The fulltext search
44     *
45     * Returns a list of matching documents for the given query
46     *
47     * refactored into pageSearch(), pageSearchCallBack() and trigger_event()
48     *
49     * @param string     $query
50     * @param array      $highlight
51     * @param string     $sort
52     * @param int|string $after  only show results with mtime after this date,
53     *                           accepts timestap or strtotime arguments
54     * @param int|string $before only show results with mtime before this date,
55     *                           accepts timestap or strtotime arguments
56     *
57     * @return array
58     */
59    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
60    {
61        if ($sort === null) {
62            $sort = 'hits';
63        }
64        $data = [
65            'query' => $query,
66            'sort' => $sort,
67            'after' => $after,
68            'before' => $before
69        ];
70        $data['highlight'] =& $highlight;
71        $action = [$this, 'pageSearchCallBack'];
72        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
73    }
74
75    /**
76     * Returns a list of matching documents for the given query
77     *
78     * @author Andreas Gohr <andi@splitbrain.org>
79     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
80     *
81     * @param array $data  event data
82     * @return array       matching documents
83     */
84    public function pageSearchCallBack(&$data)
85    {
86        // parse the given query
87        $q = (new QueryParser)->convert($data['query']);
88        $data['highlight'] = $q['highlight'];
89
90        if (empty($q['parsed_ary'])) return array();
91
92        // lookup all words found in the query
93        $FulltextIndex = FulltextIndex::getInstance();
94        $lookup = $FulltextIndex->lookupWords($q['words']);
95
96        // get all pages in this dokuwiki site (!: includes nonexistent pages)
97        $pages_all = array();
98        foreach ($FulltextIndex->getPages() as $id) {
99            $pages_all[$id] = 0; // base: 0 hit
100        }
101
102        // process the query
103        $stack = array();
104        foreach ($q['parsed_ary'] as $token) {
105            switch (substr($token, 0, 3)) {
106                case 'W+:':
107                case 'W-:':
108                case 'W_:': // word
109                    $word    = substr($token, 3);
110                    $stack[] = (array) $lookup[$word];
111                    break;
112                case 'P+:':
113                case 'P-:': // phrase
114                    $phrase = substr($token, 3);
115                    // since phrases are always parsed as ((W1)(W2)...(P)),
116                    // the end($stack) always points the pages that contain
117                    // all words in this phrase
118                    $pages  = end($stack);
119                    $pages_matched = array();
120                    foreach (array_keys($pages) as $id) {
121                        $evdata = array(
122                            'id' => $id,
123                            'phrase' => $phrase,
124                            'text' => rawWiki($id)
125                        );
126                        $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
127                        if ($event->advise_before() && $event->result !== true) {
128                            $text = Utf8\PhpString::strtolower($evdata['text']);
129                            if (strpos($text, $phrase) !== false) {
130                                $event->result = true;
131                            }
132                        }
133                        $event->advise_after();
134                        if ($event->result === true) {
135                            $pages_matched[$id] = 0; // phrase: always 0 hit
136                        }
137                    }
138                    $stack[] = $pages_matched;
139                    break;
140                case 'N+:':
141                case 'N-:': // namespace
142                    $ns = cleanID(substr($token, 3)) . ':';
143                    $pages_matched = array();
144                    foreach (array_keys($pages_all) as $id) {
145                        if (strpos($id, $ns) === 0) {
146                            $pages_matched[$id] = 0; // namespace: always 0 hit
147                        }
148                    }
149                    $stack[] = $pages_matched;
150                    break;
151                case 'AND': // and operation
152                    list($pages1, $pages2) = array_splice($stack, -2);
153                    $stack[] = $this->resultCombine(array($pages1, $pages2));
154                    break;
155                case 'OR':  // or operation
156                    list($pages1, $pages2) = array_splice($stack, -2);
157                    $stack[] = $this->resultUnite(array($pages1, $pages2));
158                    break;
159                case 'NOT': // not operation (unary)
160                    $pages   = array_pop($stack);
161                    $stack[] = $this->resultComplement(array($pages_all, $pages));
162                    break;
163            }
164        }
165        $docs = array_pop($stack);
166
167        if (empty($docs)) return array();
168
169        // check: settings, acls, existence
170        foreach (array_keys($docs) as $id) {
171            if (isHiddenPage($id)
172                || auth_quickaclcheck($id) < AUTH_READ
173                || !page_exists($id, '', false)
174            ) {
175                unset($docs[$id]);
176            }
177        }
178
179        $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']);
180
181        if ($data['sort'] === 'mtime') {
182            uksort($docs, [$this, 'pagemtimesorter']);
183        } else {
184            // sort docs by count
185            arsort($docs);
186        }
187
188        return $docs;
189    }
190
191    /**
192     * @param array      $results search results in the form pageid => value
193     * @param int|string $after   only returns results with mtime after this date,
194     *                            accepts timestap or strtotime arguments
195     * @param int|string $before  only returns results with mtime after this date,
196     *                            accepts timestap or strtotime arguments
197     *
198     * @return array
199     */
200    protected function filterResultsByTime(array $results, $after, $before)
201    {
202        if ($after || $before) {
203            $after = is_int($after) ? $after : strtotime($after);
204            $before = is_int($before) ? $before : strtotime($before);
205
206            foreach ($results as $id => $value) {
207                $mTime = filemtime(wikiFN($id));
208                if ($after && $after > $mTime) {
209                    unset($results[$id]);
210                    continue;
211                }
212                if ($before && $before < $mTime) {
213                    unset($results[$id]);
214                }
215            }
216        }
217        return $results;
218    }
219
220    /**
221     * Sort pages by their mtime, from newest to oldest
222     *
223     * @param string $a
224     * @param string $b
225     *
226     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
227     *             and 0 if they are of the same age
228     */
229    protected function pagemtimesorter($a, $b)
230    {
231        $mtimeA = filemtime(wikiFN($a));
232        $mtimeB = filemtime(wikiFN($b));
233        return $mtimeB - $mtimeA;
234    }
235
236    /**
237     * Creates a snippet extract
238     *
239     * @author Andreas Gohr <andi@splitbrain.org>
240     * @triggers FULLTEXT_SNIPPET_CREATE
241     *
242     * @param string $id page id
243     * @param array $highlight
244     * @return mixed
245     */
246    public function snippet($id, $highlight)
247    {
248        $text = rawWiki($id);
249        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
250        $evdata = array(
251            'id'        => $id,
252            'text'      => &$text,
253            'highlight' => &$highlight,
254            'snippet'   => '',
255        );
256
257        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
258        if ($evt->advise_before()) {
259            $match = array();
260            $snippets = array();
261            $utf8_offset = $offset = $end = 0;
262            $len = Utf8\PhpString::strlen($text);
263
264            // build a regexp from the phrases to highlight
265            $re1 = '(' .
266                join(
267                    '|',
268                    array_map(
269                        [$this, 'snippetRePreprocess'],
270                        array_map(
271                            'preg_quote_cb',
272                            array_filter((array) $highlight)
273                        )
274                    )
275                ) .
276                ')';
277            $re2 = "$re1.{0,75}(?!\\1)$re1";
278            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
279
280            for ($cnt=4; $cnt--;) {
281                if (0) {
282                } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
283                } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
284                } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
285                } else {
286                    break;
287                }
288
289                list($str, $idx) = $match[0];
290
291                // convert $idx (a byte offset) into a utf8 character offset
292                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
293                $utf8_len = Utf8\PhpString::strlen($str);
294
295                // establish context, 100 bytes surrounding the match string
296                // first look to see if we can go 100 either side,
297                // then drop to 50 adding any excess if the other side can't go to 50,
298                $pre = min($utf8_idx - $utf8_offset, 100);
299                $post = min($len - $utf8_idx - $utf8_len, 100);
300
301                if ($pre > 50 && $post > 50) {
302                    $pre = $post = 50;
303                } elseif ($pre > 50) {
304                    $pre = min($pre, 100 - $post);
305                } elseif ($post > 50) {
306                    $post = min($post, 100 - $pre);
307                } elseif ($offset == 0) {
308                    // both are less than 50, means the context is the whole string
309                    // make it so and break out of this loop - there is no need for the
310                    // complex snippet calculations
311                    $snippets = array($text);
312                    break;
313                }
314
315                // establish context start and end points, try to append to previous
316                // context if possible
317                $start = $utf8_idx - $pre;
318                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
319                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
320
321                if ($append) {
322                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
323                } else {
324                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
325                }
326
327                // set $offset for next match attempt
328                // continue matching after the current match
329                // if the current match is not the longest possible match starting at the current offset
330                // this prevents further matching of this snippet but for possible matches of length
331                // smaller than match length + context (at least 50 characters) this match is part of the context
332                $utf8_offset = $utf8_idx + $utf8_len;
333                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
334                $offset = Utf8\Clean::correctIdx($text, $offset);
335            }
336
337            $m = "\1";
338            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
339            $snippet = preg_replace(
340                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
341                '<strong class="search_hit">$1</strong>',
342                hsc(join('... ', $snippets))
343            );
344
345            $evdata['snippet'] = $snippet;
346        }
347        $evt->advise_after();
348        unset($evt);
349
350        return $evdata['snippet'];
351    }
352
353    /**
354     * Wraps a search term in regex boundary checks.
355     *
356     * @param string $term
357     * @return string
358     */
359    public function snippetRePreprocess($term)
360    {
361        // do not process asian terms where word boundaries are not explicit
362        if (Utf8\Asian::isAsianWords($term)) return $term;
363
364        if (UTF8_PROPERTYSUPPORT) {
365            // unicode word boundaries
366            // see http://stackoverflow.com/a/2449017/172068
367            $BL = '(?<!\pL)';
368            $BR = '(?!\pL)';
369        } else {
370            // not as correct as above, but at least won't break
371            $BL = '\b';
372            $BR = '\b';
373        }
374
375        if (substr($term, 0, 2) == '\\*') {
376            $term = substr($term, 2);
377        } else {
378            $term = $BL.$term;
379        }
380
381        if (substr($term, -2, 2) == '\\*') {
382            $term = substr($term, 0, -2);
383        } else {
384            $term = $term.$BR;
385        }
386
387        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
388            $term = '';
389        }
390        return $term;
391    }
392
393    /**
394     * Combine found documents and sum up their scores
395     *
396     * This function is used to combine searched words with a logical
397     * AND. Only documents available in all arrays are returned.
398     *
399     * based upon PEAR's PHP_Compat function for array_intersect_key()
400     *
401     * @param array $args An array of page arrays
402     * @return array
403     */
404    protected function resultCombine($args)
405    {
406        $array_count = count($args);
407        if ($array_count == 1) {
408            return $args[0];
409        }
410
411        $result = array();
412        if ($array_count > 1) {
413            foreach ($args[0] as $key => $value) {
414                $result[$key] = $value;
415                for ($i = 1; $i !== $array_count; $i++) {
416                    if (!isset($args[$i][$key])) {
417                        unset($result[$key]);
418                        break;
419                    }
420                    $result[$key] += $args[$i][$key];
421                }
422            }
423        }
424        return $result;
425    }
426
427    /**
428     * Unites found documents and sum up their scores
429     * based upon resultCombine() method
430     *
431     * @param array $args An array of page arrays
432     * @return array
433     *
434     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
435     */
436    protected function resultUnite($args)
437    {
438        $array_count = count($args);
439        if ($array_count === 1) {
440            return $args[0];
441        }
442
443        $result = $args[0];
444        for ($i = 1; $i !== $array_count; $i++) {
445            foreach (array_keys($args[$i]) as $id) {
446                $result[$id] += $args[$i][$id];
447            }
448        }
449        return $result;
450    }
451
452    /**
453     * Computes the difference of documents using page id for comparison
454     * nearly identical to PHP5's array_diff_key()
455     *
456     * @param array $args An array of page arrays
457     * @return array
458     *
459     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
460     */
461    protected function resultComplement($args)
462    {
463        $array_count = count($args);
464        if ($array_count === 1) {
465            return $args[0];
466        }
467
468        $result = $args[0];
469        foreach (array_keys($result) as $id) {
470            for ($i = 1; $i !== $array_count; $i++) {
471                if (isset($args[$i][$id])) unset($result[$id]);
472            }
473        }
474        return $result;
475    }
476}
477