<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Query\QueryEvaluator;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8\Asian;
use dokuwiki\Utf8\Clean;
use dokuwiki\Utf8\PhpString;

/**
 * DokuWiki Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * @return int
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * @param int $maxSnippets
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The actual work is
     * done in pageSearchCallBack(), which is run as the default action of the
     * SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string $query the search query string
     * @param array $highlight will be filled with terms to highlight
     * @param string|null $sort sort mode: 'hits' (default) or 'mtime'
     * @param int|string|null $after only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch(
        string $query,
        array &$highlight,
        ?string $sort = null,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $sort ??= 'hits';

        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // bound by reference so that the callback's assignment reaches the caller
        $data['highlight'] =& $highlight;

        $action = $this->pageSearchCallBack(...);
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and
     * writes the terms to highlight back into 'highlight'.
     *
     * @param array $data event data
     * @return array matching documents as pageid => score
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     */
    public function pageSearchCallBack(array &$data): array
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // look up all words via CollectionSearch
        $collection = new PageFulltextCollection();
        $search = new CollectionSearch($collection);
        foreach ($q['words'] as $word) {
            if (!Tokenizer::isValidSearchTerm($word)) continue;
            $search->addTerm($word);
        }
        $terms = $search->execute();

        // evaluate the query
        $evaluator = new QueryEvaluator($q['parsed_ary'], $terms);
        $docs = $evaluator->evaluate();

        if ($docs === []) return [];

        // filter by visibility, acls, existence, and time range
        $docs = MetadataSearch::filterPages($docs, false, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            // look up each page's mtime once instead of twice per comparison
            $mtimes = [];
            foreach (array_keys($docs) as $id) {
                $mtimes[$id] = (int) filemtime(wikiFN($id));
            }
            // newest pages first
            uksort($docs, static fn($a, $b) => $mtimes[$b] <=> $mtimes[$a]);
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * Loads the raw page text and, unless a FULLTEXT_SNIPPET_CREATE handler prevents
     * the default action, builds an HTML excerpt with the highlight terms wrapped in
     * <strong class="search_hit">. Handlers receive 'text' and 'highlight' by
     * reference and may provide their own 'snippet'.
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the value of the event's 'snippet' entry (the generated HTML by default)
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     */
    public function snippet(string $id, array $highlight): mixed
    {
        // remove soft-hyphens
        $text = str_replace("\xC2\xAD", '', rawWiki($id));

        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            // $text and $highlight may have been changed by event handlers
            $evdata['snippet'] = $this->buildSnippet((string) $text, $highlight);
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Builds the highlighted HTML excerpt for snippet()
     *
     * @param string $text the page text to extract from
     * @param array $highlight terms to highlight
     * @return string escaped HTML with hits wrapped in <strong class="search_hit">
     */
    protected function buildSnippet(string $text, array $highlight): string
    {
        // turn the phrases to highlight into regexp alternatives; terms that were
        // reduced to an empty string (e.g. a lone wildcard) are dropped because an
        // empty alternative would match at every position of the text
        $terms = array_filter(
            array_map(
                $this->snippetRePreprocess(...),
                array_map(
                    preg_quote_cb(...),
                    array_filter($highlight)
                )
            ),
            static fn($term) => $term !== ''
        );

        if ($terms === []) {
            // nothing to highlight: return the start of the text as plain excerpt
            return hsc(PhpString::substr($text, 0, 100));
        }

        $re1 = '(' . implode('|', $terms) . ')';
        $patterns = $this->snippetPatterns($re1);

        $snippets = [];
        $utf8_offset = 0;
        $offset = 0;
        $end = 0;
        $len = PhpString::strlen($text);

        // collect at most four context snippets
        for ($cnt = 4; $cnt--;) {
            // prefer matches with three different terms, then two, then a single one
            $match = [];
            $found = false;
            foreach ($patterns as $re) {
                if (preg_match('/' . $re . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) break;

            [$str, $idx] = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = PhpString::strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = 50;
                $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = PhpString::substr($text, $start, $end - $start);
            }

            // set $offset for next match attempt
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
            $offset = Clean::correctIdx($text, $offset);
        }

        // mark the hits with a control character first, so that the markers survive
        // the HTML escaping, then turn the marked ranges into <strong> tags
        $m = "\1";
        // preg_replace() returns null on PCRE errors; fall back to unmarked snippets
        $marked = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets) ?? $snippets;
        $escaped = hsc(implode('... ', $marked));

        return preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            $escaped
        ) ?? str_replace($m, '', $escaped);
    }

    /**
     * Builds the snippet search patterns from the single term pattern
     *
     * The multi term patterns use negative lookaheads on the backreferences \1 and \2
     * so that a following term does not begin with the text an earlier term matched.
     *
     * @param string $re1 pattern with exactly one capturing group matching any term
     * @return string[] patterns without delimiters, most specific first:
     *                  three terms, two terms, single term
     */
    protected function snippetPatterns(string $re1): array
    {
        // single quoted on purpose: \1 and \2 have to reach PCRE as backreferences
        $re2 = $re1 . '.{0,75}(?!\1)' . $re1;
        $re3 = $re1 . '.{0,45}(?!\1)' . $re1 . '.{0,45}(?!\1)(?!\2)' . $re1;

        return [$re3, $re2, $re1];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * The term is expected to be regex quoted already, so wildcards arrive as "\*".
     * A leading or trailing wildcard removes the boundary check on that side.
     *
     * @param string $term
     * @return string the term as regex fragment, empty if nothing but boundaries is left
     */
    public function snippetRePreprocess(string $term): string
    {
        // do not process asian terms where word boundaries are not explicit
        if (Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (str_starts_with($term, '\\*')) {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        if (str_ends_with($term, '\\*')) {
            $term = substr($term, 0, -2);
        } else {
            $term .= $BR;
        }

        // only boundary checks left, there is nothing to match
        if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
            $term = '';
        }
        return $term;
    }
}