xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 1ca67924afdba7cda93bb9fbf36899df6f3723ef)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Utf8;
7
8// create snippets for the first few results only
9const FT_SNIPPET_NUMBER = 15;
10
11/**
12 * Class DokuWiki Fulltext Search
13 *
14 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
15 * @author     Andreas Gohr <andi@splitbrain.org>
16 */
17class FulltextSearch
18{
19    /**
20     * The fulltext search
21     *
22     * Returns a list of matching documents for the given query
23     *
24     * refactored into pageSearch(), pageSearchCallBack() and trigger_event()
25     *
26     * @param string     $query
27     * @param array      $highlight
28     * @param string     $sort
29     * @param int|string $after  only show results with mtime after this date,
30     *                           accepts timestap or strtotime arguments
31     * @param int|string $before only show results with mtime before this date,
32     *                           accepts timestap or strtotime arguments
33     *
34     * @return array
35     */
36    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
37    {
38        if ($sort === null) {
39            $sort = 'hits';
40        }
41        $data = [
42            'query' => $query,
43            'sort' => $sort,
44            'after' => $after,
45            'before' => $before
46        ];
47        $data['highlight'] =& $highlight;
48        $action = [$this, 'pageSearchCallBack'];
49        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
50    }
51
52    /**
53     * Returns a list of matching documents for the given query
54     *
55     * @author Andreas Gohr <andi@splitbrain.org>
56     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
57     *
58     * @param array $data  event data
59     * @return array       matching documents
60     */
61    public function pageSearchCallBack(&$data)
62    {
63        // parse the given query
64        $q = (new QueryParser)->convert($data['query']);
65        $data['highlight'] = $q['highlight'];
66
67        if (empty($q['parsed_ary'])) return array();
68
69        // lookup all words found in the query
70        $FulltextIndex = new FulltextIndex();
71        $lookup = $FulltextIndex->lookupWords($q['words']);
72
73        // get all pages in this dokuwiki site (!: includes nonexistent pages)
74        $pages_all = array();
75        foreach ($FulltextIndex->getPages() as $id) {
76            $pages_all[$id] = 0; // base: 0 hit
77        }
78
79        // process the query
80        $stack = array();
81        foreach ($q['parsed_ary'] as $token) {
82            switch (substr($token, 0, 3)) {
83                case 'W+:':
84                case 'W-:':
85                case 'W_:': // word
86                    $word = substr($token, 3);
87                    if (array_key_exists($word, $lookup)) $stack[] = (array) $lookup[$word];
88                    break;
89                case 'P+:':
90                case 'P-:': // phrase
91                    $phrase = substr($token, 3);
92                    // since phrases are always parsed as ((W1)(W2)...(P)),
93                    // the end($stack) always points the pages that contain
94                    // all words in this phrase
95                    $pages  = end($stack);
96                    $pages_matched = array();
97                    foreach (array_keys($pages) as $id) {
98                        $evdata = array(
99                            'id' => $id,
100                            'phrase' => $phrase,
101                            'text' => rawWiki($id)
102                        );
103                        $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
104                        if ($event->advise_before() && $event->result !== true) {
105                            $text = Utf8\PhpString::strtolower($evdata['text']);
106                            if (strpos($text, $phrase) !== false) {
107                                $event->result = true;
108                            }
109                        }
110                        $event->advise_after();
111                        if ($event->result === true) {
112                            $pages_matched[$id] = 0; // phrase: always 0 hit
113                        }
114                    }
115                    $stack[] = $pages_matched;
116                    break;
117                case 'N+:':
118                case 'N-:': // namespace
119                    $ns = cleanID(substr($token, 3)) . ':';
120                    $pages_matched = array();
121                    foreach (array_keys($pages_all) as $id) {
122                        if (strpos($id, $ns) === 0) {
123                            $pages_matched[$id] = 0; // namespace: always 0 hit
124                        }
125                    }
126                    $stack[] = $pages_matched;
127                    break;
128                case 'AND': // and operation
129                    list($pages1, $pages2) = array_splice($stack, -2);
130                    $stack[] = $this->resultCombine(array($pages1, $pages2));
131                    break;
132                case 'OR':  // or operation
133                    list($pages1, $pages2) = array_splice($stack, -2);
134                    $stack[] = $this->resultUnite(array($pages1, $pages2));
135                    break;
136                case 'NOT': // not operation (unary)
137                    $pages   = array_pop($stack);
138                    $stack[] = $this->resultComplement(array($pages_all, $pages));
139                    break;
140            }
141        }
142        $docs = array_pop($stack);
143
144        if (empty($docs)) return array();
145
146        // check: settings, acls, existence
147        foreach (array_keys($docs) as $id) {
148            if (isHiddenPage($id)
149                || auth_quickaclcheck($id) < AUTH_READ
150                || !page_exists($id, '', false)
151            ) {
152                unset($docs[$id]);
153            }
154        }
155
156        $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']);
157
158        if ($data['sort'] === 'mtime') {
159            uksort($docs, [$this, 'pagemtimesorter']);
160        } else {
161            // sort docs by count
162            arsort($docs);
163        }
164
165        return $docs;
166    }
167
168    /**
169     * @param array      $results search results in the form pageid => value
170     * @param int|string $after   only returns results with mtime after this date,
171     *                            accepts timestap or strtotime arguments
172     * @param int|string $before  only returns results with mtime after this date,
173     *                            accepts timestap or strtotime arguments
174     *
175     * @return array
176     */
177    protected function filterResultsByTime(array $results, $after, $before)
178    {
179        if ($after || $before) {
180            $after = is_int($after) ? $after : strtotime($after);
181            $before = is_int($before) ? $before : strtotime($before);
182
183            foreach ($results as $id => $value) {
184                $mTime = filemtime(wikiFN($id));
185                if ($after && $after > $mTime) {
186                    unset($results[$id]);
187                    continue;
188                }
189                if ($before && $before < $mTime) {
190                    unset($results[$id]);
191                }
192            }
193        }
194        return $results;
195    }
196
197    /**
198     * Sort pages by their mtime, from newest to oldest
199     *
200     * @param string $a
201     * @param string $b
202     *
203     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
204     *             and 0 if they are of the same age
205     */
206    protected function pagemtimesorter($a, $b)
207    {
208        $mtimeA = filemtime(wikiFN($a));
209        $mtimeB = filemtime(wikiFN($b));
210        return $mtimeB - $mtimeA;
211    }
212
213    /**
214     * Creates a snippet extract
215     *
216     * @author Andreas Gohr <andi@splitbrain.org>
217     * @triggers FULLTEXT_SNIPPET_CREATE
218     *
219     * @param string $id page id
220     * @param array $highlight
221     * @return mixed
222     */
223    public function snippet($id, $highlight)
224    {
225        $text = rawWiki($id);
226        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
227        $evdata = array(
228            'id'        => $id,
229            'text'      => &$text,
230            'highlight' => &$highlight,
231            'snippet'   => '',
232        );
233
234        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
235        if ($evt->advise_before()) {
236            $match = array();
237            $snippets = array();
238            $utf8_offset = $offset = $end = 0;
239            $len = Utf8\PhpString::strlen($text);
240
241            // build a regexp from the phrases to highlight
242            $re1 = '(' .
243                join(
244                    '|',
245                    array_map(
246                        [$this, 'snippetRePreprocess'],
247                        array_map(
248                            'preg_quote_cb',
249                            array_filter((array) $highlight)
250                        )
251                    )
252                ) .
253                ')';
254            $re2 = "$re1.{0,75}(?!\\1)$re1";
255            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
256
257            for ($cnt=4; $cnt--;) {
258                if (0) {
259                } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
260                } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
261                } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
262                } else {
263                    break;
264                }
265
266                list($str, $idx) = $match[0];
267
268                // convert $idx (a byte offset) into a utf8 character offset
269                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
270                $utf8_len = Utf8\PhpString::strlen($str);
271
272                // establish context, 100 bytes surrounding the match string
273                // first look to see if we can go 100 either side,
274                // then drop to 50 adding any excess if the other side can't go to 50,
275                $pre = min($utf8_idx - $utf8_offset, 100);
276                $post = min($len - $utf8_idx - $utf8_len, 100);
277
278                if ($pre > 50 && $post > 50) {
279                    $pre = $post = 50;
280                } elseif ($pre > 50) {
281                    $pre = min($pre, 100 - $post);
282                } elseif ($post > 50) {
283                    $post = min($post, 100 - $pre);
284                } elseif ($offset == 0) {
285                    // both are less than 50, means the context is the whole string
286                    // make it so and break out of this loop - there is no need for the
287                    // complex snippet calculations
288                    $snippets = array($text);
289                    break;
290                }
291
292                // establish context start and end points, try to append to previous
293                // context if possible
294                $start = $utf8_idx - $pre;
295                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
296                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
297
298                if ($append) {
299                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
300                } else {
301                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
302                }
303
304                // set $offset for next match attempt
305                // continue matching after the current match
306                // if the current match is not the longest possible match starting at the current offset
307                // this prevents further matching of this snippet but for possible matches of length
308                // smaller than match length + context (at least 50 characters) this match is part of the context
309                $utf8_offset = $utf8_idx + $utf8_len;
310                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
311                $offset = Utf8\Clean::correctIdx($text, $offset);
312            }
313
314            $m = "\1";
315            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
316            $snippet = preg_replace(
317                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
318                '<strong class="search_hit">$1</strong>',
319                hsc(join('... ', $snippets))
320            );
321
322            $evdata['snippet'] = $snippet;
323        }
324        $evt->advise_after();
325        unset($evt);
326
327        return $evdata['snippet'];
328    }
329
330    /**
331     * Wraps a search term in regex boundary checks.
332     *
333     * @param string $term
334     * @return string
335     */
336    public function snippetRePreprocess($term)
337    {
338        // do not process asian terms where word boundaries are not explicit
339        if (Utf8\Asian::isAsianWords($term)) return $term;
340
341        if (UTF8_PROPERTYSUPPORT) {
342            // unicode word boundaries
343            // see http://stackoverflow.com/a/2449017/172068
344            $BL = '(?<!\pL)';
345            $BR = '(?!\pL)';
346        } else {
347            // not as correct as above, but at least won't break
348            $BL = '\b';
349            $BR = '\b';
350        }
351
352        if (substr($term, 0, 2) == '\\*') {
353            $term = substr($term, 2);
354        } else {
355            $term = $BL.$term;
356        }
357
358        if (substr($term, -2, 2) == '\\*') {
359            $term = substr($term, 0, -2);
360        } else {
361            $term = $term.$BR;
362        }
363
364        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
365            $term = '';
366        }
367        return $term;
368    }
369
370    /**
371     * Combine found documents and sum up their scores
372     *
373     * This function is used to combine searched words with a logical
374     * AND. Only documents available in all arrays are returned.
375     *
376     * based upon PEAR's PHP_Compat function for array_intersect_key()
377     *
378     * @param array $args An array of page arrays
379     * @return array
380     */
381    protected function resultCombine($args)
382    {
383        $array_count = count($args);
384        if ($array_count == 1) {
385            return $args[0];
386        }
387
388        $result = array();
389        if ($array_count > 1) {
390            foreach ($args[0] as $key => $value) {
391                $result[$key] = $value;
392                for ($i = 1; $i !== $array_count; $i++) {
393                    if (!isset($args[$i][$key])) {
394                        unset($result[$key]);
395                        break;
396                    }
397                    $result[$key] += $args[$i][$key];
398                }
399            }
400        }
401        return $result;
402    }
403
404    /**
405     * Unites found documents and sum up their scores
406     * based upon resultCombine() method
407     *
408     * @param array $args An array of page arrays
409     * @return array
410     *
411     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
412     */
413    protected function resultUnite($args)
414    {
415        $array_count = count($args);
416        if ($array_count === 1) {
417            return $args[0];
418        }
419
420        $result = $args[0];
421        for ($i = 1; $i !== $array_count; $i++) {
422            foreach (array_keys($args[$i]) as $id) {
423                $result[$id] += $args[$i][$id];
424            }
425        }
426        return $result;
427    }
428
429    /**
430     * Computes the difference of documents using page id for comparison
431     * nearly identical to PHP5's array_diff_key()
432     *
433     * @param array $args An array of page arrays
434     * @return array
435     *
436     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
437     */
438    protected function resultComplement($args)
439    {
440        $array_count = count($args);
441        if ($array_count === 1) {
442            return $args[0];
443        }
444
445        $result = $args[0];
446        foreach (array_keys($result) as $id) {
447            for ($i = 1; $i !== $array_count; $i++) {
448                if (isset($args[$i][$id])) unset($result[$id]);
449            }
450        }
451        return $result;
452    }
453}
454