xref: /dokuwiki/inc/Search/FulltextSearch.php (revision a32da6dda625a55bd713657b22f3ab332757fca6)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Utf8;
7
8// create snippets for the first few results only
9const FT_SNIPPET_NUMBER = 15;
10
11/**
12 * Class DokuWiki Fulltext Search
13 *
14 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
15 * @author     Andreas Gohr <andi@splitbrain.org>
16 */
17class FulltextSearch
18{
19    /** @var FulltextSearch $instance */
20    protected static $instance = null;
21
22    /**
23     * Get new or existing singleton instance of the FulltextSearch
24     *
25     * @return FulltextSearch
26     */
27    public static function getInstance()
28    {
29        if (is_null(static::$instance)) {
30            static::$instance = new static();
31        }
32        return static::$instance;
33    }
34
35    /**
36     *  Fulltext Search constructor. prevent direct object creation
37     */
38    protected function __construct() {}
39
40    /**
41     * The fulltext search
42     *
43     * Returns a list of matching documents for the given query
44     *
45     * refactored into pageSearch(), pageSearchCallBack() and trigger_event()
46     *
47     * @param string     $query
48     * @param array      $highlight
49     * @param string     $sort
50     * @param int|string $after  only show results with mtime after this date,
51     *                           accepts timestap or strtotime arguments
52     * @param int|string $before only show results with mtime before this date,
53     *                           accepts timestap or strtotime arguments
54     *
55     * @return array
56     */
57    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
58    {
59        if ($sort === null) {
60            $sort = 'hits';
61        }
62        $data = [
63            'query' => $query,
64            'sort' => $sort,
65            'after' => $after,
66            'before' => $before
67        ];
68        $data['highlight'] =& $highlight;
69        $action = [$this, 'pageSearchCallBack'];
70        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
71    }
72
73    /**
74     * Returns a list of matching documents for the given query
75     *
76     * @author Andreas Gohr <andi@splitbrain.org>
77     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
78     *
79     * @param array $data  event data
80     * @return array       matching documents
81     */
82    public function pageSearchCallBack(&$data)
83    {
84        // parse the given query
85        $q = (new QueryParser)->convert($data['query']);
86        $data['highlight'] = $q['highlight'];
87
88        if (empty($q['parsed_ary'])) return array();
89
90        // lookup all words found in the query
91        $FulltextIndex = new FulltextIndex();
92        $lookup = $FulltextIndex->lookupWords($q['words']);
93
94        // get all pages in this dokuwiki site (!: includes nonexistent pages)
95        $pages_all = array();
96        foreach ($FulltextIndex->getPages() as $id) {
97            $pages_all[$id] = 0; // base: 0 hit
98        }
99
100        // process the query
101        $stack = array();
102        foreach ($q['parsed_ary'] as $token) {
103            switch (substr($token, 0, 3)) {
104                case 'W+:':
105                case 'W-:':
106                case 'W_:': // word
107                    $word    = substr($token, 3);
108                    $stack[] = (array) $lookup[$word];
109                    break;
110                case 'P+:':
111                case 'P-:': // phrase
112                    $phrase = substr($token, 3);
113                    // since phrases are always parsed as ((W1)(W2)...(P)),
114                    // the end($stack) always points the pages that contain
115                    // all words in this phrase
116                    $pages  = end($stack);
117                    $pages_matched = array();
118                    foreach (array_keys($pages) as $id) {
119                        $evdata = array(
120                            'id' => $id,
121                            'phrase' => $phrase,
122                            'text' => rawWiki($id)
123                        );
124                        $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
125                        if ($event->advise_before() && $event->result !== true) {
126                            $text = Utf8\PhpString::strtolower($evdata['text']);
127                            if (strpos($text, $phrase) !== false) {
128                                $event->result = true;
129                            }
130                        }
131                        $event->advise_after();
132                        if ($event->result === true) {
133                            $pages_matched[$id] = 0; // phrase: always 0 hit
134                        }
135                    }
136                    $stack[] = $pages_matched;
137                    break;
138                case 'N+:':
139                case 'N-:': // namespace
140                    $ns = cleanID(substr($token, 3)) . ':';
141                    $pages_matched = array();
142                    foreach (array_keys($pages_all) as $id) {
143                        if (strpos($id, $ns) === 0) {
144                            $pages_matched[$id] = 0; // namespace: always 0 hit
145                        }
146                    }
147                    $stack[] = $pages_matched;
148                    break;
149                case 'AND': // and operation
150                    list($pages1, $pages2) = array_splice($stack, -2);
151                    $stack[] = $this->resultCombine(array($pages1, $pages2));
152                    break;
153                case 'OR':  // or operation
154                    list($pages1, $pages2) = array_splice($stack, -2);
155                    $stack[] = $this->resultUnite(array($pages1, $pages2));
156                    break;
157                case 'NOT': // not operation (unary)
158                    $pages   = array_pop($stack);
159                    $stack[] = $this->resultComplement(array($pages_all, $pages));
160                    break;
161            }
162        }
163        $docs = array_pop($stack);
164
165        if (empty($docs)) return array();
166
167        // check: settings, acls, existence
168        foreach (array_keys($docs) as $id) {
169            if (isHiddenPage($id)
170                || auth_quickaclcheck($id) < AUTH_READ
171                || !page_exists($id, '', false)
172            ) {
173                unset($docs[$id]);
174            }
175        }
176
177        $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']);
178
179        if ($data['sort'] === 'mtime') {
180            uksort($docs, [$this, 'pagemtimesorter']);
181        } else {
182            // sort docs by count
183            arsort($docs);
184        }
185
186        return $docs;
187    }
188
189    /**
190     * @param array      $results search results in the form pageid => value
191     * @param int|string $after   only returns results with mtime after this date,
192     *                            accepts timestap or strtotime arguments
193     * @param int|string $before  only returns results with mtime after this date,
194     *                            accepts timestap or strtotime arguments
195     *
196     * @return array
197     */
198    protected function filterResultsByTime(array $results, $after, $before)
199    {
200        if ($after || $before) {
201            $after = is_int($after) ? $after : strtotime($after);
202            $before = is_int($before) ? $before : strtotime($before);
203
204            foreach ($results as $id => $value) {
205                $mTime = filemtime(wikiFN($id));
206                if ($after && $after > $mTime) {
207                    unset($results[$id]);
208                    continue;
209                }
210                if ($before && $before < $mTime) {
211                    unset($results[$id]);
212                }
213            }
214        }
215        return $results;
216    }
217
218    /**
219     * Sort pages by their mtime, from newest to oldest
220     *
221     * @param string $a
222     * @param string $b
223     *
224     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
225     *             and 0 if they are of the same age
226     */
227    protected function pagemtimesorter($a, $b)
228    {
229        $mtimeA = filemtime(wikiFN($a));
230        $mtimeB = filemtime(wikiFN($b));
231        return $mtimeB - $mtimeA;
232    }
233
234    /**
235     * Creates a snippet extract
236     *
237     * @author Andreas Gohr <andi@splitbrain.org>
238     * @triggers FULLTEXT_SNIPPET_CREATE
239     *
240     * @param string $id page id
241     * @param array $highlight
242     * @return mixed
243     */
244    public function snippet($id, $highlight)
245    {
246        $text = rawWiki($id);
247        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
248        $evdata = array(
249            'id'        => $id,
250            'text'      => &$text,
251            'highlight' => &$highlight,
252            'snippet'   => '',
253        );
254
255        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
256        if ($evt->advise_before()) {
257            $match = array();
258            $snippets = array();
259            $utf8_offset = $offset = $end = 0;
260            $len = Utf8\PhpString::strlen($text);
261
262            // build a regexp from the phrases to highlight
263            $re1 = '(' .
264                join(
265                    '|',
266                    array_map(
267                        [$this, 'snippetRePreprocess'],
268                        array_map(
269                            'preg_quote_cb',
270                            array_filter((array) $highlight)
271                        )
272                    )
273                ) .
274                ')';
275            $re2 = "$re1.{0,75}(?!\\1)$re1";
276            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
277
278            for ($cnt=4; $cnt--;) {
279                if (0) {
280                } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
281                } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
282                } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
283                } else {
284                    break;
285                }
286
287                list($str, $idx) = $match[0];
288
289                // convert $idx (a byte offset) into a utf8 character offset
290                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
291                $utf8_len = Utf8\PhpString::strlen($str);
292
293                // establish context, 100 bytes surrounding the match string
294                // first look to see if we can go 100 either side,
295                // then drop to 50 adding any excess if the other side can't go to 50,
296                $pre = min($utf8_idx - $utf8_offset, 100);
297                $post = min($len - $utf8_idx - $utf8_len, 100);
298
299                if ($pre > 50 && $post > 50) {
300                    $pre = $post = 50;
301                } elseif ($pre > 50) {
302                    $pre = min($pre, 100 - $post);
303                } elseif ($post > 50) {
304                    $post = min($post, 100 - $pre);
305                } elseif ($offset == 0) {
306                    // both are less than 50, means the context is the whole string
307                    // make it so and break out of this loop - there is no need for the
308                    // complex snippet calculations
309                    $snippets = array($text);
310                    break;
311                }
312
313                // establish context start and end points, try to append to previous
314                // context if possible
315                $start = $utf8_idx - $pre;
316                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
317                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
318
319                if ($append) {
320                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
321                } else {
322                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
323                }
324
325                // set $offset for next match attempt
326                // continue matching after the current match
327                // if the current match is not the longest possible match starting at the current offset
328                // this prevents further matching of this snippet but for possible matches of length
329                // smaller than match length + context (at least 50 characters) this match is part of the context
330                $utf8_offset = $utf8_idx + $utf8_len;
331                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
332                $offset = Utf8\Clean::correctIdx($text, $offset);
333            }
334
335            $m = "\1";
336            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
337            $snippet = preg_replace(
338                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
339                '<strong class="search_hit">$1</strong>',
340                hsc(join('... ', $snippets))
341            );
342
343            $evdata['snippet'] = $snippet;
344        }
345        $evt->advise_after();
346        unset($evt);
347
348        return $evdata['snippet'];
349    }
350
351    /**
352     * Wraps a search term in regex boundary checks.
353     *
354     * @param string $term
355     * @return string
356     */
357    public function snippetRePreprocess($term)
358    {
359        // do not process asian terms where word boundaries are not explicit
360        if (Utf8\Asian::isAsianWords($term)) return $term;
361
362        if (UTF8_PROPERTYSUPPORT) {
363            // unicode word boundaries
364            // see http://stackoverflow.com/a/2449017/172068
365            $BL = '(?<!\pL)';
366            $BR = '(?!\pL)';
367        } else {
368            // not as correct as above, but at least won't break
369            $BL = '\b';
370            $BR = '\b';
371        }
372
373        if (substr($term, 0, 2) == '\\*') {
374            $term = substr($term, 2);
375        } else {
376            $term = $BL.$term;
377        }
378
379        if (substr($term, -2, 2) == '\\*') {
380            $term = substr($term, 0, -2);
381        } else {
382            $term = $term.$BR;
383        }
384
385        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
386            $term = '';
387        }
388        return $term;
389    }
390
391    /**
392     * Combine found documents and sum up their scores
393     *
394     * This function is used to combine searched words with a logical
395     * AND. Only documents available in all arrays are returned.
396     *
397     * based upon PEAR's PHP_Compat function for array_intersect_key()
398     *
399     * @param array $args An array of page arrays
400     * @return array
401     */
402    protected function resultCombine($args)
403    {
404        $array_count = count($args);
405        if ($array_count == 1) {
406            return $args[0];
407        }
408
409        $result = array();
410        if ($array_count > 1) {
411            foreach ($args[0] as $key => $value) {
412                $result[$key] = $value;
413                for ($i = 1; $i !== $array_count; $i++) {
414                    if (!isset($args[$i][$key])) {
415                        unset($result[$key]);
416                        break;
417                    }
418                    $result[$key] += $args[$i][$key];
419                }
420            }
421        }
422        return $result;
423    }
424
425    /**
426     * Unites found documents and sum up their scores
427     * based upon resultCombine() method
428     *
429     * @param array $args An array of page arrays
430     * @return array
431     *
432     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
433     */
434    protected function resultUnite($args)
435    {
436        $array_count = count($args);
437        if ($array_count === 1) {
438            return $args[0];
439        }
440
441        $result = $args[0];
442        for ($i = 1; $i !== $array_count; $i++) {
443            foreach (array_keys($args[$i]) as $id) {
444                $result[$id] += $args[$i][$id];
445            }
446        }
447        return $result;
448    }
449
450    /**
451     * Computes the difference of documents using page id for comparison
452     * nearly identical to PHP5's array_diff_key()
453     *
454     * @param array $args An array of page arrays
455     * @return array
456     *
457     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
458     */
459    protected function resultComplement($args)
460    {
461        $array_count = count($args);
462        if ($array_count === 1) {
463            return $args[0];
464        }
465
466        $result = $args[0];
467        foreach (array_keys($result) as $id) {
468            for ($i = 1; $i !== $array_count; $i++) {
469                if (isset($args[$i][$id])) unset($result[$id]);
470            }
471        }
472        return $result;
473    }
474}
475