xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\CollectionSearch;
7use dokuwiki\Search\Collection\PageFulltextCollection;
8use dokuwiki\Search\Query\QueryEvaluator;
9use dokuwiki\Search\Query\QueryParser;
10use dokuwiki\Utf8;
11
12/**
13 * DokuWiki Fulltext Search
14 *
15 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
16 * @author     Andreas Gohr <andi@splitbrain.org>
17 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * Get the maximum number of results snippets are generated for
     *
     * @return int
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * Set the maximum number of results snippets are generated for
     *
     * @param int $maxSnippets
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Wraps the actual search (pageSearchCallBack) in the SEARCH_QUERY_FULLPAGE
     * event and returns whatever the event yields.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query     the search query string
     * @param array      $highlight will be filled with terms to highlight
     * @param string     $sort      sort mode: 'hits' (default, also used for null) or 'mtime'
     * @param int|string $after     only show results with mtime after this date,
     *                              accepts timestamp or strtotime arguments
     * @param int|string $before    only show results with mtime before this date,
     *                              accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        $sort ??= 'hits';

        // 'highlight' is stored by reference so that the terms written by the
        // callback end up in the caller's variable
        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before,
            'highlight' => &$highlight,
        ];

        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, [$this, 'pageSearchCallBack']);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and
     * writes the terms to highlight back into 'highlight'.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @param array $data  event data
     * @return array       matching documents as pageid => score
     */
    public function pageSearchCallBack(&$data)
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // look up all valid words via CollectionSearch
        $search = new CollectionSearch(new PageFulltextCollection());
        foreach ($q['words'] as $word) {
            if (Tokenizer::isValidSearchTerm($word)) {
                $search->addTerm($word);
            }
        }
        $terms = $search->execute();

        // evaluate the query
        $docs = (new QueryEvaluator($q['parsed_ary'], $terms))->evaluate();

        if (empty($docs)) return [];

        // filter by visibility, acls, existence, and time range
        $docs = MetadataSearch::filterPages($docs, false, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            // read every modification time once instead of hitting the
            // filesystem twice for each comparison the sort performs
            $mtimes = [];
            foreach (array_keys($docs) as $page) {
                $mtimes[$page] = (int) filemtime(wikiFN($page));
            }
            // newest page first
            uksort($docs, static function ($a, $b) use ($mtimes) {
                return $mtimes[$b] <=> $mtimes[$a];
            });
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * The page text and the highlight terms are handed to the
     * FULLTEXT_SNIPPET_CREATE event by reference. Unless the default action is
     * prevented, the snippet is built from the (possibly modified) text and terms.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the content of the event data's 'snippet' entry; the default
     *               action stores an HTML string there
     */
    public function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $evdata['snippet'] = $this->buildSnippet($text, $highlight);
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Builds the regular expression group matching any of the highlight terms
     *
     * Terms that end up as an empty pattern (e.g. a lone wildcard) are dropped.
     * An empty alternative would match the empty string at every position and
     * make the snippet code emit empty highlight tags between all characters.
     *
     * @param array|mixed $highlight terms to highlight
     * @return string a capturing group like "(foo|bar)", or '' when no usable term exists
     */
    private function buildHighlightPattern($highlight)
    {
        $alternatives = [];
        foreach (array_filter((array) $highlight) as $term) {
            $pattern = (string) $this->snippetRePreprocess(preg_quote_cb($term));
            if ($pattern !== '') {
                $alternatives[] = $pattern;
            }
        }

        if ($alternatives === []) return '';
        return '(' . join('|', $alternatives) . ')';
    }

    /**
     * Extracts up to four excerpts around the highlight terms and marks the hits
     *
     * @param string $text raw page text
     * @param array|mixed $highlight terms to highlight
     * @return string HTML: escaped excerpts joined by '... ', hits wrapped in
     *                <strong class="search_hit">
     */
    private function buildSnippet($text, $highlight)
    {
        $re1 = $this->buildHighlightPattern($highlight);
        if ($re1 === '') {
            // nothing to search for: return the plain start of the page
            return hsc(Utf8\PhpString::substr($text, 0, 100));
        }

        // prefer spots where three or two *different* terms are close together
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
        $patterns = ['/' . $re3 . '/iu', '/' . $re2 . '/iu', '/' . $re1 . '/iu'];

        $snippets = [];
        $utf8_offset = $offset = $end = 0;
        $len = Utf8\PhpString::strlen($text);

        for ($round = 0; $round < 4; $round++) {
            // try the most specific pattern first
            $match = [];
            $found = false;
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) break;

            [$str, $idx] = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = Utf8\PhpString::strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
            }

            // continue matching right after the current match; $str is the
            // matched byte string, so its byte length advances the byte offset
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = Utf8\Clean::correctIdx($text, $idx + strlen($str));
        }

        // surround the hits with a control character that survives HTML
        // escaping, then turn the marked spans into highlight tags
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);

        return preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(join('... ', $snippets))
        );
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * The term is expected to be regex-quoted already, so a wildcard shows up
     * as the two characters "\*". A leading/trailing wildcard is removed and
     * replaces the boundary check on that side.
     *
     * @param string $term
     * @return string the pattern, or '' if nothing but boundary checks is left
     */
    public function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        // left side: wildcard or boundary
        if (substr($term, 0, 2) === '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        // right side: wildcard or boundary
        if (substr($term, -2, 2) === '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term . $BR;
        }

        // only boundary checks left, there is nothing to match
        if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
            $term = '';
        }
        return $term;
    }
}
275