xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 9369b4a991666bc911474806b106d8958e79f4c1)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\CollectionSearch;
7use dokuwiki\Search\Collection\PageFulltextCollection;
8use dokuwiki\Search\Query\QueryEvaluator;
9use dokuwiki\Search\Query\QueryParser;
10use dokuwiki\Utf8\Asian;
11use dokuwiki\Utf8\Clean;
12use dokuwiki\Utf8\PhpString;
13
14/**
15 * DokuWiki Fulltext Search
16 *
17 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
18 * @author     Andreas Gohr <andi@splitbrain.org>
19 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * Get the maximum number of results snippets should be generated for
     *
     * @return int
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * Set the maximum number of results snippets should be generated for
     *
     * @param int $maxSnippets
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The actual work is done in
     * pageSearchCallBack() which is registered as the default action of the triggered event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string $query the search query string
     * @param array $highlight will be filled with terms to highlight
     * @param string|null $sort sort mode: 'hits' (default) or 'mtime'
     * @param int|string|null $after only show results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch(
        string $query,
        array &$highlight,
        ?string $sort = null,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $sort ??= 'hits';

        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // bound by reference so the terms set by the callback reach the caller's variable
        $data['highlight'] =& $highlight;

        $action = $this->pageSearchCallBack(...);
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and writes the
     * terms to highlight back into 'highlight'.
     *
     * @param array $data event data
     * @return array       matching documents as pageid => score
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     */
    public function pageSearchCallBack(array &$data): array
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // look up all words via CollectionSearch
        $collection = new PageFulltextCollection();
        $search = new CollectionSearch($collection);
        foreach ($q['words'] as $word) {
            if (!Tokenizer::isValidSearchTerm($word)) continue;
            $search->addTerm($word);
        }
        $terms = $search->execute();

        // evaluate the query
        $evaluator = new QueryEvaluator($q['parsed_ary'], $terms);
        $docs = $evaluator->evaluate();

        if ($docs === []) return [];

        // filter by visibility, acls, existence, and time range
        $docs = MetadataSearch::filterPages($docs, false, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            // look up every modification time once instead of twice per comparison
            $mtimes = [];
            foreach (array_keys($docs) as $id) {
                // a failed filemtime() counts as 0, like it did in the former subtraction
                $mtimes[$id] = (int)filemtime(wikiFN($id));
            }
            // most recently modified pages first
            uksort($docs, static fn($a, $b) => $mtimes[$b] <=> $mtimes[$a]);
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * Loads the raw page text and, unless a FULLTEXT_SNIPPET_CREATE handler prevents the
     * default action, builds an HTML escaped excerpt with the highlight terms marked up.
     * Handlers may change 'text' and 'highlight' before the default action runs and may
     * provide or modify 'snippet' themselves.
     *
     * @param string $id page id
     * @param array $highlight
     * @return mixed
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     */
    public function snippet(string $id, array $highlight): mixed
    {
        $text = rawWiki($id);
        // remove soft-hyphens
        $text = str_replace("\xC2\xAD", '', $text);

        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $evdata['snippet'] = $this->buildSnippet($text, $highlight);
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Builds the highlighted HTML snippet for the given text
     *
     * Picks up to four match locations, preferring places where several different terms
     * occur close together, extracts some context around each of them and wraps all
     * matched terms in <strong class="search_hit">.
     *
     * @param string $text the page text to create the excerpt from
     * @param array $highlight the terms to highlight
     * @return string|null the HTML snippet, null if the final replacement fails
     */
    private function buildSnippet($text, array $highlight)
    {
        // build a regexp from the phrases to highlight; terms that are empty after
        // preprocessing are dropped, an empty alternative would match at every position
        $patterns = array_filter(
            array_map(
                $this->snippetRePreprocess(...),
                array_map(
                    preg_quote_cb(...),
                    array_filter($highlight)
                )
            ),
            static fn($pattern) => $pattern !== ''
        );

        if ($patterns === []) {
            // nothing to look for: return the start of the text without any highlighting
            return hsc(PhpString::substr($text, 0, 100));
        }

        $re1 = '(' . implode('|', $patterns) . ')';
        // two or three terms close to each other; the back references in the negative
        // lookaheads reject a repetition of an already captured term
        $re2 = $re1 . '.{0,75}(?!\\1)' . $re1;
        $re3 = $re1 . '.{0,45}(?!\\1)' . $re1 . '.{0,45}(?!\\1)(?!\\2)' . $re1;

        // tried in this order: three terms, two terms, a single term
        $regexes = [
            '/' . $re3 . '/iu',
            '/' . $re2 . '/iu',
            '/' . $re1 . '/iu',
        ];

        $snippets = [];
        $utf8_offset = 0;
        $offset = 0;
        $end = 0;
        $len = PhpString::strlen($text);

        for ($round = 0; $round < 4; $round++) {
            $match = [];
            $found = false;
            foreach ($regexes as $regex) {
                if (preg_match($regex, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) break;

            [$str, $idx] = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = PhpString::strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = 50;
                $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = PhpString::substr($text, $start, $end - $start);
            }

            // set $offset for next match attempt
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
            $offset = Clean::correctIdx($text, $offset);
        }

        // surround all hits with a control character first, so the text can be HTML escaped
        // before the markers are turned into the highlighting markup
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        return preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(implode('... ', $snippets))
        );
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A term starting or ending with an escaped asterisk (backslash followed by `*`) has
     * that wildcard removed and gets no boundary check on the respective side.
     *
     * @param string $term
     * @return string the term with boundary checks, an empty string if nothing but
     *                boundary checks is left
     */
    public function snippetRePreprocess(string $term): string
    {
        // do not process asian terms where word boundaries are not explicit
        if (Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        // a leading wildcard means: no boundary check on the left side
        if (str_starts_with($term, '\\*')) {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        // a trailing wildcard means: no boundary check on the right side
        if (str_ends_with($term, '\\*')) {
            $term = substr($term, 0, -2);
        } else {
            $term .= $BR;
        }

        // only boundary checks are left, there is nothing to search for
        if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
            $term = '';
        }
        return $term;
    }
}
284