xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 743c9a28dd4c99f9336a634d173da73f2c7f1e59)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\FulltextIndex;
7use dokuwiki\Search\QueryParser;
8use dokuwiki\Utf8;
9
10// create snippets for the first few results only
11const FT_SNIPPET_NUMBER = 15;
12
13/**
14 * Class DokuWiki Fulltext Search
15 *
16 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
17 * @author     Andreas Gohr <andi@splitbrain.org>
18 */
19class FulltextSearch
20{
21    /**
22     *  Fulltext Search constructor. prevent direct object creation
23     */
24    protected function __construct() {}
25
26    /**
27     * The fulltext search
28     *
29     * Returns a list of matching documents for the given query
30     *
31     * refactored into pageSearch(), pageSearchCallBack() and trigger_event()
32     *
33     * @param string     $query
34     * @param array      $highlight
35     * @param string     $sort
36     * @param int|string $after  only show results with mtime after this date,
37     *                           accepts timestap or strtotime arguments
38     * @param int|string $before only show results with mtime before this date,
39     *                           accepts timestap or strtotime arguments
40     *
41     * @return array
42     */
43    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
44    {
45        if ($sort === null) {
46            $sort = 'hits';
47        }
48        $data = [
49            'query' => $query,
50            'sort' => $sort,
51            'after' => $after,
52            'before' => $before
53        ];
54        $data['highlight'] =& $highlight;
55        $action = static::class.'::pageSearchCallBack';
56        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
57    }
58
59    /**
60     * Returns a list of matching documents for the given query
61     *
62     * @author Andreas Gohr <andi@splitbrain.org>
63     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
64     *
65     * @param array $data  event data
66     * @return array       matching documents
67     */
68    public static function pageSearchCallBack(&$data)
69    {
70        // parse the given query
71        $q = QueryParser::convert($data['query']);
72        $data['highlight'] = $q['highlight'];
73
74        if (empty($q['parsed_ary'])) return array();
75
76        // lookup all words found in the query
77        $FulltextIndex = FulltextIndex::getInstance();
78        $lookup = $FulltextIndex->lookupWords($q['words']);
79
80        // get all pages in this dokuwiki site (!: includes nonexistent pages)
81        $pages_all = array();
82        foreach ($FulltextIndex->getPages() as $id) {
83            $pages_all[$id] = 0; // base: 0 hit
84        }
85
86        // process the query
87        $stack = array();
88        foreach ($q['parsed_ary'] as $token) {
89            switch (substr($token, 0, 3)) {
90                case 'W+:':
91                case 'W-:':
92                case 'W_:': // word
93                    $word    = substr($token, 3);
94                    $stack[] = (array) $lookup[$word];
95                    break;
96                case 'P+:':
97                case 'P-:': // phrase
98                    $phrase = substr($token, 3);
99                    // since phrases are always parsed as ((W1)(W2)...(P)),
100                    // the end($stack) always points the pages that contain
101                    // all words in this phrase
102                    $pages  = end($stack);
103                    $pages_matched = array();
104                    foreach (array_keys($pages) as $id) {
105                        $evdata = array(
106                            'id' => $id,
107                            'phrase' => $phrase,
108                            'text' => rawWiki($id)
109                        );
110                        $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
111                        if ($event->advise_before() && $event->result !== true) {
112                            $text = Utf8\PhpString::strtolower($evdata['text']);
113                            if (strpos($text, $phrase) !== false) {
114                                $event->result = true;
115                            }
116                        }
117                        $event->advise_after();
118                        if ($event->result === true) {
119                            $pages_matched[$id] = 0; // phrase: always 0 hit
120                        }
121                    }
122                    $stack[] = $pages_matched;
123                    break;
124                case 'N+:':
125                case 'N-:': // namespace
126                    $ns = cleanID(substr($token, 3)) . ':';
127                    $pages_matched = array();
128                    foreach (array_keys($pages_all) as $id) {
129                        if (strpos($id, $ns) === 0) {
130                            $pages_matched[$id] = 0; // namespace: always 0 hit
131                        }
132                    }
133                    $stack[] = $pages_matched;
134                    break;
135                case 'AND': // and operation
136                    list($pages1, $pages2) = array_splice($stack, -2);
137                    $stack[] = static::resultCombine(array($pages1, $pages2));
138                    break;
139                case 'OR':  // or operation
140                    list($pages1, $pages2) = array_splice($stack, -2);
141                    $stack[] = static::resultUnite(array($pages1, $pages2));
142                    break;
143                case 'NOT': // not operation (unary)
144                    $pages   = array_pop($stack);
145                    $stack[] = static::resultComplement(array($pages_all, $pages));
146                    break;
147            }
148        }
149        $docs = array_pop($stack);
150
151        if (empty($docs)) return array();
152
153        // check: settings, acls, existence
154        foreach (array_keys($docs) as $id) {
155            if (isHiddenPage($id)
156                || auth_quickaclcheck($id) < AUTH_READ
157                || !page_exists($id, '', false)
158            ) {
159                unset($docs[$id]);
160            }
161        }
162
163        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);
164
165        if ($data['sort'] === 'mtime') {
166            uksort($docs, static::class.'::pagemtimesorter');
167        } else {
168            // sort docs by count
169            arsort($docs);
170        }
171
172        return $docs;
173    }
174
175    /**
176     * @param array      $results search results in the form pageid => value
177     * @param int|string $after   only returns results with mtime after this date,
178     *                            accepts timestap or strtotime arguments
179     * @param int|string $before  only returns results with mtime after this date,
180     *                            accepts timestap or strtotime arguments
181     *
182     * @return array
183     */
184    protected static function filterResultsByTime(array $results, $after, $before)
185    {
186        if ($after || $before) {
187            $after = is_int($after) ? $after : strtotime($after);
188            $before = is_int($before) ? $before : strtotime($before);
189
190            foreach ($results as $id => $value) {
191                $mTime = filemtime(wikiFN($id));
192                if ($after && $after > $mTime) {
193                    unset($results[$id]);
194                    continue;
195                }
196                if ($before && $before < $mTime) {
197                    unset($results[$id]);
198                }
199            }
200        }
201        return $results;
202    }
203
204    /**
205     * Sort pages by their mtime, from newest to oldest
206     *
207     * @param string $a
208     * @param string $b
209     *
210     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
211     *             and 0 if they are of the same age
212     */
213    protected static function pagemtimesorter($a, $b)
214    {
215        $mtimeA = filemtime(wikiFN($a));
216        $mtimeB = filemtime(wikiFN($b));
217        return $mtimeB - $mtimeA;
218    }
219
220    /**
221     * Creates a snippet extract
222     *
223     * @author Andreas Gohr <andi@splitbrain.org>
224     * @triggers FULLTEXT_SNIPPET_CREATE
225     *
226     * @param string $id page id
227     * @param array $highlight
228     * @return mixed
229     */
230    public static function snippet($id, $highlight)
231    {
232        $text = rawWiki($id);
233        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
234        $evdata = array(
235            'id'        => $id,
236            'text'      => &$text,
237            'highlight' => &$highlight,
238            'snippet'   => '',
239        );
240
241        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
242        if ($evt->advise_before()) {
243            $match = array();
244            $snippets = array();
245            $utf8_offset = $offset = $end = 0;
246            $len = Utf8\PhpString::strlen($text);
247
248            // build a regexp from the phrases to highlight
249            $re1 = '(' .
250                join(
251                    '|',
252                    array_map(
253                        static::class.'::snippetRePreprocess',
254                        array_map(
255                            'preg_quote_cb',
256                            array_filter((array) $highlight)
257                        )
258                    )
259                ) .
260                ')';
261            $re2 = "$re1.{0,75}(?!\\1)$re1";
262            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
263
264            for ($cnt=4; $cnt--;) {
265                if (0) {
266                } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
267                } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
268                } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
269                } else {
270                    break;
271                }
272
273                list($str, $idx) = $match[0];
274
275                // convert $idx (a byte offset) into a utf8 character offset
276                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
277                $utf8_len = Utf8\PhpString::strlen($str);
278
279                // establish context, 100 bytes surrounding the match string
280                // first look to see if we can go 100 either side,
281                // then drop to 50 adding any excess if the other side can't go to 50,
282                $pre = min($utf8_idx - $utf8_offset, 100);
283                $post = min($len - $utf8_idx - $utf8_len, 100);
284
285                if ($pre > 50 && $post > 50) {
286                    $pre = $post = 50;
287                } elseif ($pre > 50) {
288                    $pre = min($pre, 100 - $post);
289                } elseif ($post > 50) {
290                    $post = min($post, 100 - $pre);
291                } elseif ($offset == 0) {
292                    // both are less than 50, means the context is the whole string
293                    // make it so and break out of this loop - there is no need for the
294                    // complex snippet calculations
295                    $snippets = array($text);
296                    break;
297                }
298
299                // establish context start and end points, try to append to previous
300                // context if possible
301                $start = $utf8_idx - $pre;
302                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
303                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
304
305                if ($append) {
306                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
307                } else {
308                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
309                }
310
311                // set $offset for next match attempt
312                // continue matching after the current match
313                // if the current match is not the longest possible match starting at the current offset
314                // this prevents further matching of this snippet but for possible matches of length
315                // smaller than match length + context (at least 50 characters) this match is part of the context
316                $utf8_offset = $utf8_idx + $utf8_len;
317                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
318                $offset = Utf8\Clean::correctIdx($text, $offset);
319            }
320
321            $m = "\1";
322            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
323            $snippet = preg_replace(
324                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
325                '<strong class="search_hit">$1</strong>',
326                hsc(join('... ', $snippets))
327            );
328
329            $evdata['snippet'] = $snippet;
330        }
331        $evt->advise_after();
332        unset($evt);
333
334        return $evdata['snippet'];
335    }
336
337    /**
338     * Wraps a search term in regex boundary checks.
339     *
340     * @param string $term
341     * @return string
342     */
343    public static function snippetRePreprocess($term)
344    {
345        // do not process asian terms where word boundaries are not explicit
346        if (Utf8\Asian::isAsianWords($term)) return $term;
347
348        if (UTF8_PROPERTYSUPPORT) {
349            // unicode word boundaries
350            // see http://stackoverflow.com/a/2449017/172068
351            $BL = '(?<!\pL)';
352            $BR = '(?!\pL)';
353        } else {
354            // not as correct as above, but at least won't break
355            $BL = '\b';
356            $BR = '\b';
357        }
358
359        if (substr($term, 0, 2) == '\\*') {
360            $term = substr($term, 2);
361        } else {
362            $term = $BL.$term;
363        }
364
365        if (substr($term, -2, 2) == '\\*') {
366            $term = substr($term, 0, -2);
367        } else {
368            $term = $term.$BR;
369        }
370
371        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
372            $term = '';
373        }
374        return $term;
375    }
376
377    /**
378     * Combine found documents and sum up their scores
379     *
380     * This function is used to combine searched words with a logical
381     * AND. Only documents available in all arrays are returned.
382     *
383     * based upon PEAR's PHP_Compat function for array_intersect_key()
384     *
385     * @param array $args An array of page arrays
386     * @return array
387     */
388    protected static function resultCombine($args)
389    {
390        $array_count = count($args);
391        if ($array_count == 1) {
392            return $args[0];
393        }
394
395        $result = array();
396        if ($array_count > 1) {
397            foreach ($args[0] as $key => $value) {
398                $result[$key] = $value;
399                for ($i = 1; $i !== $array_count; $i++) {
400                    if (!isset($args[$i][$key])) {
401                        unset($result[$key]);
402                        break;
403                    }
404                    $result[$key] += $args[$i][$key];
405                }
406            }
407        }
408        return $result;
409    }
410
411    /**
412     * Unites found documents and sum up their scores
413     * based upon resultCombine() method
414     *
415     * @param array $args An array of page arrays
416     * @return array
417     *
418     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
419     */
420    protected static function resultUnite($args)
421    {
422        $array_count = count($args);
423        if ($array_count === 1) {
424            return $args[0];
425        }
426
427        $result = $args[0];
428        for ($i = 1; $i !== $array_count; $i++) {
429            foreach (array_keys($args[$i]) as $id) {
430                $result[$id] += $args[$i][$id];
431            }
432        }
433        return $result;
434    }
435
436    /**
437     * Computes the difference of documents using page id for comparison
438     * nearly identical to PHP5's array_diff_key()
439     *
440     * @param array $args An array of page arrays
441     * @return array
442     *
443     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
444     */
445    protected static function resultComplement($args)
446    {
447        $array_count = count($args);
448        if ($array_count === 1) {
449            return $args[0];
450        }
451
452        $result = $args[0];
453        foreach (array_keys($result) as $id) {
454            for ($i = 1; $i !== $array_count; $i++) {
455                if (isset($args[$i][$id])) unset($result[$id]);
456            }
457        }
458        return $result;
459    }
460}
461