xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 0cba610bea94e5841d211c0d3f57ae96e8ad1379)
1<?php
2namespace dokuwiki\Search;
3
4use dokuwiki\Extension\Event;
5use dokuwiki\Search\PageIndex;
6use dokuwiki\Search\QueryParser;
7use dokuwiki\Utf8;
8
/**
 * Number of search results for which a snippet is created
 * (only the first few results get one)
 */
12const FT_SNIPPET_NUMBER = 15;
13
14/**
15 * Class DokuWiki Fulltext Search
16 *
17 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
18 * @author     Andreas Gohr <andi@splitbrain.org>
19 */
20class FulltextSearch
21{
22    /**
23     *  Fulltext Search constructor. prevent direct object creation
24     */
25    protected function __construct() {}
26
27    /**
28     * The fulltext search
29     *
30     * Returns a list of matching documents for the given query
31     *
32     * refactored into ft_pageSearch(), _ft_pageSearch() and trigger_event()
33     *
34     * @param string     $query
35     * @param array      $highlight
36     * @param string     $sort
37     * @param int|string $after  only show results with mtime after this date,
38     *                           accepts timestap or strtotime arguments
39     * @param int|string $before only show results with mtime before this date,
40     *                           accepts timestap or strtotime arguments
41     *
42     * @return array
43     */
44    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
45    {
46        if ($sort === null) {
47            $sort = 'hits';
48        }
49        $data = [
50            'query' => $query,
51            'sort' => $sort,
52            'after' => $after,
53            'before' => $before
54        ];
55        $data['highlight'] =& $highlight;
56        $action = static::class.'::callback_pageSearch';
57        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
58    }
59
60    /**
61     * Returns a list of matching documents for the given query
62     *
63     * @author Andreas Gohr <andi@splitbrain.org>
64     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
65     *
66     * @param array $data  event data
67     * @return array       matching documents
68     */
69    public static function callback_pageSearch($data)
70    {
71        $Indexer = PageIndex::getInstance();
72
73        // parse the given query
74        $q = QueryParser::convert($data['query']);
75        $data['highlight'] = $q['highlight'];
76
77        if (empty($q['parsed_ary'])) return array();
78
79        // lookup all words found in the query
80        $lookup = $Indexer->lookup($q['words']);
81
82        // get all pages in this dokuwiki site (!: includes nonexistent pages)
83        $pages_all = array();
84        foreach ($Indexer->getPages() as $id) {
85            $pages_all[$id] = 0; // base: 0 hit
86        }
87
88        // process the query
89        $stack = array();
90        foreach ($q['parsed_ary'] as $token) {
91            switch (substr($token, 0, 3)) {
92                case 'W+:':
93                case 'W-:':
94                case 'W_:': // word
95                    $word    = substr($token, 3);
96                    $stack[] = (array) $lookup[$word];
97                    break;
98                case 'P+:':
99                case 'P-:': // phrase
100                    $phrase = substr($token, 3);
101                    // since phrases are always parsed as ((W1)(W2)...(P)),
102                    // the end($stack) always points the pages that contain
103                    // all words in this phrase
104                    $pages  = end($stack);
105                    $pages_matched = array();
106                    foreach (array_keys($pages) as $id) {
107                        $evdata = array(
108                            'id' => $id,
109                            'phrase' => $phrase,
110                            'text' => rawWiki($id)
111                        );
112                        $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
113                        if ($evt->advise_before() && $evt->result !== true) {
114                            $text = Utf8\PhpString::strtolower($evdata['text']);
115                            if (strpos($text, $phrase) !== false) {
116                                $evt->result = true;
117                            }
118                        }
119                        $evt->advise_after();
120                        if ($evt->result === true) {
121                            $pages_matched[$id] = 0; // phrase: always 0 hit
122                        }
123                    }
124                    $stack[] = $pages_matched;
125                    break;
126                case 'N+:':
127                case 'N-:': // namespace
128                    $ns = cleanID(substr($token, 3)) . ':';
129                    $pages_matched = array();
130                    foreach (array_keys($pages_all) as $id) {
131                        if (strpos($id, $ns) === 0) {
132                            $pages_matched[$id] = 0; // namespace: always 0 hit
133                        }
134                    }
135                    $stack[] = $pages_matched;
136                    break;
137                case 'AND': // and operation
138                    list($pages1, $pages2) = array_splice($stack, -2);
139                    $stack[] = static::resultCombine(array($pages1, $pages2));
140                    break;
141                case 'OR':  // or operation
142                    list($pages1, $pages2) = array_splice($stack, -2);
143                    $stack[] = static::resultUnite(array($pages1, $pages2));
144                    break;
145                case 'NOT': // not operation (unary)
146                    $pages   = array_pop($stack);
147                    $stack[] = static::resultComplement(array($pages_all, $pages));
148                    break;
149            }
150        }
151        $docs = array_pop($stack);
152
153        if (empty($docs)) return array();
154
155        // check: settings, acls, existence
156        foreach (array_keys($docs) as $id) {
157            if (isHiddenPage($id)
158                || auth_quickaclcheck($id) < AUTH_READ
159                || !page_exists($id, '', false)
160            ) {
161                unset($docs[$id]);
162            }
163        }
164
165        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);
166
167        if ($data['sort'] === 'mtime') {
168            uksort($docs, static::class.'::pagemtimesorter');
169        } else {
170            // sort docs by count
171            arsort($docs);
172        }
173
174        return $docs;
175    }
176
177    /**
178     * @param array      $results search results in the form pageid => value
179     * @param int|string $after   only returns results with mtime after this date,
180     *                            accepts timestap or strtotime arguments
181     * @param int|string $before  only returns results with mtime after this date,
182     *                            accepts timestap or strtotime arguments
183     *
184     * @return array
185     */
186    protected static function filterResultsByTime(array $results, $after, $before)
187    {
188        if ($after || $before) {
189            $after = is_int($after) ? $after : strtotime($after);
190            $before = is_int($before) ? $before : strtotime($before);
191
192            foreach ($results as $id => $value) {
193                $mTime = filemtime(wikiFN($id));
194                if ($after && $after > $mTime) {
195                    unset($results[$id]);
196                    continue;
197                }
198                if ($before && $before < $mTime) {
199                    unset($results[$id]);
200                }
201            }
202        }
203        return $results;
204    }
205
206    /**
207     * Sort pages by their mtime, from newest to oldest
208     *
209     * @param string $a
210     * @param string $b
211     *
212     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
213     *             and 0 if they are of the same age
214     */
215    protected static function pagemtimesorter($a, $b)
216    {
217        $mtimeA = filemtime(wikiFN($a));
218        $mtimeB = filemtime(wikiFN($b));
219        return $mtimeB - $mtimeA;
220    }
221
222    /**
223     * Creates a snippet extract
224     *
225     * @author Andreas Gohr <andi@splitbrain.org>
226     * @triggers FULLTEXT_SNIPPET_CREATE
227     *
228     * @param string $id page id
229     * @param array $highlight
230     * @return mixed
231     */
232    public static function snippet($id, $highlight)
233    {
234        $text = rawWiki($id);
235        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
236        $evdata = array(
237            'id'        => $id,
238            'text'      => &$text,
239            'highlight' => &$highlight,
240            'snippet'   => '',
241        );
242
243        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
244        if ($evt->advise_before()) {
245            $match = array();
246            $snippets = array();
247            $utf8_offset = $offset = $end = 0;
248            $len = Utf8\PhpString::strlen($text);
249
250            // build a regexp from the phrases to highlight
251            $re1 = '(' .
252                join(
253                    '|',
254                    array_map(
255                        static::class.'::snippet_re_preprocess',
256                        array_map(
257                            'preg_quote_cb',
258                            array_filter((array) $highlight)
259                        )
260                    )
261                ) .
262                ')';
263            $re2 = "$re1.{0,75}(?!\\1)$re1";
264            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
265
266            for ($cnt=4; $cnt--;) {
267                if (0) {
268                } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
269                } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
270                } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
271                } else {
272                    break;
273                }
274
275                list($str, $idx) = $match[0];
276
277                // convert $idx (a byte offset) into a utf8 character offset
278                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
279                $utf8_len = Utf8\PhpString::strlen($str);
280
281                // establish context, 100 bytes surrounding the match string
282                // first look to see if we can go 100 either side,
283                // then drop to 50 adding any excess if the other side can't go to 50,
284                $pre = min($utf8_idx - $utf8_offset, 100);
285                $post = min($len - $utf8_idx - $utf8_len, 100);
286
287                if ($pre > 50 && $post > 50) {
288                    $pre = $post = 50;
289                } elseif ($pre > 50) {
290                    $pre = min($pre, 100 - $post);
291                } elseif ($post > 50) {
292                    $post = min($post, 100 - $pre);
293                } elseif ($offset == 0) {
294                    // both are less than 50, means the context is the whole string
295                    // make it so and break out of this loop - there is no need for the
296                    // complex snippet calculations
297                    $snippets = array($text);
298                    break;
299                }
300
301                // establish context start and end points, try to append to previous
302                // context if possible
303                $start = $utf8_idx - $pre;
304                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
305                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
306
307                if ($append) {
308                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
309                } else {
310                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
311                }
312
313                // set $offset for next match attempt
314                // continue matching after the current match
315                // if the current match is not the longest possible match starting at the current offset
316                // this prevents further matching of this snippet but for possible matches of length
317                // smaller than match length + context (at least 50 characters) this match is part of the context
318                $utf8_offset = $utf8_idx + $utf8_len;
319                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
320                $offset = Utf8\Clean::correctIdx($text, $offset);
321            }
322
323            $m = "\1";
324            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
325            $snippet = preg_replace(
326                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
327                '<strong class="search_hit">$1</strong>',
328                hsc(join('... ', $snippets))
329            );
330
331            $evdata['snippet'] = $snippet;
332        }
333        $evt->advise_after();
334        unset($evt);
335
336        return $evdata['snippet'];
337    }
338
339    /**
340     * Wraps a search term in regex boundary checks.
341     *
342     * @param string $term
343     * @return string
344     */
345    public static function snippet_re_preprocess($term)
346    {
347        // do not process asian terms where word boundaries are not explicit
348        if (Utf8\Asian::isAsianWords($term)) return $term;
349
350        if (UTF8_PROPERTYSUPPORT) {
351            // unicode word boundaries
352            // see http://stackoverflow.com/a/2449017/172068
353            $BL = '(?<!\pL)';
354            $BR = '(?!\pL)';
355        } else {
356            // not as correct as above, but at least won't break
357            $BL = '\b';
358            $BR = '\b';
359        }
360
361        if (substr($term, 0, 2) == '\\*') {
362            $term = substr($term, 2);
363        } else {
364            $term = $BL.$term;
365        }
366
367        if (substr($term, -2, 2) == '\\*') {
368            $term = substr($term, 0, -2);
369        } else {
370            $term = $term.$BR;
371        }
372
373        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
374            $term = '';
375        }
376        return $term;
377    }
378
379    /**
380     * Combine found documents and sum up their scores
381     *
382     * This function is used to combine searched words with a logical
383     * AND. Only documents available in all arrays are returned.
384     *
385     * based upon PEAR's PHP_Compat function for array_intersect_key()
386     *
387     * @param array $args An array of page arrays
388     * @return array
389     */
390    protected static function resultCombine($args)
391    {
392        $array_count = count($args);
393        if ($array_count == 1) {
394            return $args[0];
395        }
396
397        $result = array();
398        if ($array_count > 1) {
399            foreach ($args[0] as $key => $value) {
400                $result[$key] = $value;
401                for ($i = 1; $i !== $array_count; $i++) {
402                    if (!isset($args[$i][$key])) {
403                        unset($result[$key]);
404                        break;
405                    }
406                    $result[$key] += $args[$i][$key];
407                }
408            }
409        }
410        return $result;
411    }
412
413    /**
414     * Unites found documents and sum up their scores
415     * based upon resultCombine() method
416     *
417     * @param array $args An array of page arrays
418     * @return array
419     *
420     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
421     */
422    protected static function resultUnite($args)
423    {
424        $array_count = count($args);
425        if ($array_count === 1) {
426            return $args[0];
427        }
428
429        $result = $args[0];
430        for ($i = 1; $i !== $array_count; $i++) {
431            foreach (array_keys($args[$i]) as $id) {
432                $result[$id] += $args[$i][$id];
433            }
434        }
435        return $result;
436    }
437
438    /**
439     * Computes the difference of documents using page id for comparison
440     * nearly identical to PHP5's array_diff_key()
441     *
442     * @param array $args An array of page arrays
443     * @return array
444     *
445     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
446     */
447    protected static function resultComplement($args)
448    {
449        $array_count = count($args);
450        if ($array_count === 1) {
451            return $args[0];
452        }
453
454        $result = $args[0];
455        foreach (array_keys($result) as $id) {
456            for ($i = 1; $i !== $array_count; $i++) {
457                if (isset($args[$i][$id])) unset($result[$id]);
458            }
459        }
460        return $result;
461    }
462}
463