<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Query\QueryEvaluator;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8\Asian;
use dokuwiki\Utf8\Clean;
use dokuwiki\Utf8\PhpString;

/**
 * DokuWiki Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * Get the maximum number of results to generate snippets for
     *
     * @return int
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * Set the maximum number of results to generate snippets for
     *
     * @param int $maxSnippets
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The actual work is
     * done in pageSearchCallBack() which is run as the default action of the
     * SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string $query the search query string
     * @param array $highlight will be filled with terms to highlight
     * @param string|null $sort sort mode: 'hits' (default) or 'mtime'
     * @param int|string|null $after only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch(
        string $query,
        array &$highlight,
        ?string $sort = null,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $sort ??= 'hits';

        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // bound by reference so the terms set by the callback end up in the caller's variable
        $data['highlight'] =& $highlight;

        $action = $this->pageSearchCallBack(...);
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and
     * writes the terms to highlight back to 'highlight'.
     *
     * @param array $data event data
     * @return array matching documents as pageid => score
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     */
    public function pageSearchCallBack(array &$data): array
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // look up all words via CollectionSearch
        $collection = new PageFulltextCollection();
        $search = new CollectionSearch($collection);
        foreach ($q['words'] as $word) {
            if (!Tokenizer::isValidSearchTerm($word)) continue;
            $search->addTerm($word);
        }
        $terms = $search->execute();

        // evaluate the query
        $evaluator = new QueryEvaluator($q['parsed_ary'], $terms);
        $docs = $evaluator->evaluate();

        if ($docs === []) return [];

        // filter by visibility, acls, existence, and time range
        $docs = MetadataSearch::filterPages($docs, false, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            // look up each modification time once instead of on every comparison
            $mtimes = [];
            foreach (array_keys($docs) as $id) {
                $mtimes[$id] = (int) filemtime(wikiFN($id));
            }
            // newest pages first
            uksort($docs, static fn($a, $b) => $mtimes[$b] <=> $mtimes[$a]);
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * Builds up to a handful of context excerpts around the highlight terms found in
     * the raw page text, joins them with '... ' and wraps each hit in
     * <strong class="search_hit">. The result is HTML escaped. When no usable highlight
     * term is given, the escaped beginning of the page text is returned instead.
     *
     * Event handlers may change 'text' and 'highlight' before the default action runs,
     * or prevent it and provide their own 'snippet'.
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the snippet as stored in the event data
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     */
    public function snippet(string $id, array $highlight): mixed
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            // build a regexp from the phrases to highlight
            $re1 = $this->buildHighlightPattern($highlight);

            if ($re1 === '') {
                // Nothing to highlight. An empty pattern would match at every position
                // and litter the output with empty highlight tags, so just use the
                // beginning of the text as a plain excerpt.
                $evdata['snippet'] = hsc(PhpString::substr($text, 0, 100));
            } else {
                $match = [];
                $snippets = [];
                $utf8_offset = 0;
                $offset = 0;
                $end = 0;
                $len = PhpString::strlen($text);

                // Prefer passages containing two or three *different* terms close together.
                // The negative lookaheads use back references to the earlier capture groups,
                // single quotes keep the backslashes intact for the regexp engine.
                $re2 = $re1 . '.{0,75}(?!\1)' . $re1;
                $re3 = $re1 . '.{0,45}(?!\1)' . $re1 . '.{0,45}(?!\1)(?!\2)' . $re1;

                // collect at most four excerpts
                for ($cnt = 4; $cnt--;) {
                    // try the most specific pattern first, stop when nothing matches anymore
                    if (
                        !preg_match('/' . $re3 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset) &&
                        !preg_match('/' . $re2 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset) &&
                        !preg_match('/' . $re1 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    ) {
                        break;
                    }

                    [$str, $idx] = $match[0];

                    // convert $idx (a byte offset) into a utf8 character offset
                    $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
                    $utf8_len = PhpString::strlen($str);

                    // establish context, 100 characters surrounding the match string
                    // first look to see if we can go 100 either side,
                    // then drop to 50 adding any excess if the other side can't go to 50,
                    $pre = min($utf8_idx - $utf8_offset, 100);
                    $post = min($len - $utf8_idx - $utf8_len, 100);

                    if ($pre > 50 && $post > 50) {
                        $pre = 50;
                        $post = 50;
                    } elseif ($pre > 50) {
                        $pre = min($pre, 100 - $post);
                    } elseif ($post > 50) {
                        $post = min($post, 100 - $pre);
                    } elseif ($offset == 0) {
                        // both are less than 50, means the context is the whole string
                        // make it so and break out of this loop - there is no need for the
                        // complex snippet calculations
                        $snippets = [$text];
                        break;
                    }

                    // establish context start and end points, try to append to previous
                    // context if possible
                    $start = $utf8_idx - $pre;
                    $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
                    $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context

                    if ($append) {
                        $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
                    } else {
                        $snippets[] = PhpString::substr($text, $start, $end - $start);
                    }

                    // set $offset for next match attempt
                    // continue matching after the current match
                    // if the current match is not the longest possible match starting at the current offset
                    // this prevents further matching of this snippet but for possible matches of length
                    // smaller than match length + context (at least 50 characters) this match is part of the context
                    $utf8_offset = $utf8_idx + $utf8_len;
                    $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
                    $offset = Clean::correctIdx($text, $offset);
                }

                // Mark the hits with a control character (0x01) first, escape the text and
                // only then turn the markers into HTML so the tags are not escaped themselves.
                $m = "\1";
                $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
                $snippet = preg_replace(
                    '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                    '<strong class="search_hit">$1</strong>',
                    hsc(implode('... ', $snippets))
                );

                $evdata['snippet'] = $snippet;
            }
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Builds the capturing alternation of all highlight terms used to find snippet hits
     *
     * Each non-empty term is regexp-quoted and passed through snippetRePreprocess().
     * Terms that end up empty are dropped because an empty alternative would match
     * at every position of the text.
     *
     * @param array $highlight terms to highlight
     * @return string the pattern (without delimiters), empty string when no usable term is left
     */
    protected function buildHighlightPattern(array $highlight): string
    {
        $terms = array_filter($highlight);
        $terms = array_map(preg_quote_cb(...), $terms);
        $terms = array_map($this->snippetRePreprocess(...), $terms);
        $terms = array_filter($terms, static fn($term) => $term !== '');

        if ($terms === []) return '';

        return '(' . implode('|', $terms) . ')';
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * The term is expected to be regexp-quoted already. An escaped asterisk
     * (backslash followed by an asterisk) at the start or end of the term is removed
     * and no boundary check is added on that side. Terms recognized as Asian words
     * are returned untouched.
     *
     * @param string $term
     * @return string the term with boundary checks, empty string if nothing but boundaries is left
     */
    public function snippetRePreprocess(string $term): string
    {
        // do not process asian terms where word boundaries are not explicit
        if (Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (str_starts_with($term, '\\*')) {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        if (str_ends_with($term, '\\*')) {
            $term = substr($term, 0, -2);
        } else {
            $term .= $BR;
        }

        // only boundary checks left, there is no actual term to match
        if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
            $term = '';
        }
        return $term;
    }
}