xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 4f29a5b95f4d435233ea7be53601adafa274beb7)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\CollectionSearch;
7use dokuwiki\Search\Collection\PageFulltextCollection;
8use dokuwiki\Search\Query\QueryEvaluator;
9use dokuwiki\Search\Query\QueryParser;
10use dokuwiki\Utf8\Asian;
11use dokuwiki\Utf8\Clean;
12use dokuwiki\Utf8\PhpString;
13
14/**
15 * DokuWiki Fulltext Search
16 *
17 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
18 * @author     Andreas Gohr <andi@splitbrain.org>
19 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * Get the maximum number of results to generate snippets for
     *
     * @return int
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * Set the maximum number of results to generate snippets for
     *
     * @param int $maxSnippets
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The actual work
     * is done in pageSearchCallBack(), which is registered as the default action
     * of the triggered event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string $query the search query string
     * @param array $highlight will be filled with terms to highlight
     * @param string|null $sort sort mode: 'hits' (default) or 'mtime'
     * @param int|string|null $after only show results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch(
        string $query,
        array &$highlight,
        ?string $sort = null,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $sort ??= 'hits';

        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // bound by reference so that the terms written by the callback reach the caller
        $data['highlight'] =& $highlight;

        $action = $this->pageSearchCallBack(...);
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and writes
     * the terms to highlight back into 'highlight'.
     *
     * @param array $data event data
     * @return array       matching documents as pageid => score
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     */
    public function pageSearchCallBack(array &$data): array
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // look up all words via CollectionSearch
        $collection = new PageFulltextCollection();
        $search = new CollectionSearch($collection);
        foreach ($q['words'] as $word) {
            if (!Tokenizer::isValidSearchTerm($word)) continue;
            $search->addTerm($word);
        }
        $terms = $search->execute();

        // evaluate the query
        $evaluator = new QueryEvaluator($q['parsed_ary'], $terms);
        $docs = $evaluator->evaluate();

        if ($docs === []) return [];

        // filter by visibility, acls, existence, and time range
        $docs = MetadataSearch::filterPages($docs, false, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            // look up each page's modification time once instead of on every comparison
            $mtimes = [];
            foreach (array_keys($docs) as $id) {
                $mtimes[$id] = (int) filemtime(wikiFN($id));
            }
            // newest pages first
            uksort($docs, static fn($a, $b) => $mtimes[$b] <=> $mtimes[$a]);
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * Looks for up to four places in the page text where the highlight terms
     * occur, preferring places where three or two different terms are close
     * together, cuts out some context around each place and wraps the terms in
     * <strong class="search_hit"> tags. The text is passed through hsc() before
     * the tags are added.
     *
     * When there is no usable highlight term or no term occurs in the text, the
     * default snippet is an empty string.
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the 'snippet' entry of the event data, by default the HTML built here
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     */
    public function snippet(string $id, array $highlight): mixed
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = [];
            $snippets = [];
            $utf8_offset = 0;
            $offset = 0;
            $end = 0;
            $len = PhpString::strlen($text);
            $m = "\1"; // marker byte put around hits until the text has been escaped

            // build the regexps from the phrases to highlight
            $patterns = $this->buildSnippetPatterns(
                array_map(
                    $this->snippetRePreprocess(...),
                    array_map(
                        preg_quote_cb(...),
                        array_filter($highlight)
                    )
                )
            );

            if ($patterns !== []) {
                [$re1, $re2, $re3] = $patterns;

                for ($cnt = 4; $cnt--;) {
                    // prefer three different terms close together, then two, then any single term
                    $found = false;
                    foreach ([$re3, $re2, $re1] as $re) {
                        if (preg_match('/' . $re . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                            $found = true;
                            break;
                        }
                    }
                    if (!$found) break;

                    [$str, $idx] = $match[0];

                    // convert $idx (a byte offset) into a utf8 character offset
                    $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
                    $utf8_len = PhpString::strlen($str);

                    // establish context, 100 characters surrounding the match string
                    // first look to see if we can go 100 either side,
                    // then drop to 50 adding any excess if the other side can't go to 50,
                    $pre = min($utf8_idx - $utf8_offset, 100);
                    $post = min($len - $utf8_idx - $utf8_len, 100);

                    if ($pre > 50 && $post > 50) {
                        $pre = 50;
                        $post = 50;
                    } elseif ($pre > 50) {
                        $pre = min($pre, 100 - $post);
                    } elseif ($post > 50) {
                        $post = min($post, 100 - $pre);
                    } elseif ($offset == 0) {
                        // both are less than 50, means the context is the whole string
                        // make it so and break out of this loop - there is no need for the
                        // complex snippet calculations
                        $snippets = [$text];
                        break;
                    }

                    // establish context start and end points, try to append to previous
                    // context if possible
                    $start = $utf8_idx - $pre;
                    $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                    $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                    if ($append) {
                        $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
                    } else {
                        $snippets[] = PhpString::substr($text, $start, $end - $start);
                    }

                    // set $offset for next match attempt
                    // continue matching after the current match
                    // if the current match is not the longest possible match starting at the current offset
                    // this prevents further matching of this snippet but for possible matches of length
                    // smaller than match length + context (at least 50 characters) this match is part of the context
                    $utf8_offset = $utf8_idx + $utf8_len;
                    $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
                    $offset = Clean::correctIdx($text, $offset);
                }

                // surround every hit with the marker byte, it survives the escaping below
                $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
            }

            // escape the text, then turn the marked hits into highlight tags
            $snippet = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                hsc(implode('... ', $snippets))
            );

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Builds the regular expressions used to locate snippet positions
     *
     * Empty term expressions are dropped first: an empty alternative would match
     * at every position of the text.
     *
     * @param string[] $terms regex fragments, one per highlight term
     * @return string[] empty when no usable term is left, otherwise three patterns without delimiters:
     *                  [0] any single term,
     *                  [1] two different terms at most 75 characters apart,
     *                  [2] three different terms, each at most 45 characters apart
     */
    protected function buildSnippetPatterns(array $terms): array
    {
        $terms = array_filter($terms, static fn($term) => $term !== '');
        if ($terms === []) return [];

        $re1 = '(' . implode('|', $terms) . ')';
        // "\\1" is the PCRE back reference \1: the following term must differ from the first one
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        return [$re1, $re2, $re3];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * The term is expected to be regex-quoted already, so a wildcard at either
     * end shows up as '\*'. A wildcard is removed and no boundary check is added
     * on that side. Asian terms are returned unchanged.
     *
     * @param string $term
     * @return string the regex fragment, empty if nothing but boundary checks would be left
     */
    public function snippetRePreprocess(string $term): string
    {
        // do not process asian terms where word boundaries are not explicit
        if (Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (str_starts_with($term, '\\*')) {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        if (str_ends_with($term, '\\*')) {
            $term = substr($term, 0, -2);
        } else {
            $term .= $BR;
        }

        // nothing but boundary checks left, such a term cannot be highlighted
        if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
            $term = '';
        }
        return $term;
    }
}
283