<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\FrequencyCollectionSearch;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Query\QueryEvaluator;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8;

/**
 * DokuWiki Fulltext Search
 *
 * Runs a parsed query against the page fulltext collection, filters and sorts
 * the matching pages, and builds highlighted text snippets for result pages.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * @return int the configured maximum number of results to generate snippets for
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * @param int $maxSnippets new maximum number of results to generate snippets for
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The actual work
     * is done in pageSearchCallBack(), which is registered as the default action
     * of the SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string $query the search query string
     * @param array $highlight will be filled with terms to highlight
     * @param string|null $sort sort mode: 'hits' (default) or 'mtime'
     * @param int|string $after only show results with mtime after this date,
     *                          accepts timestamp or strtotime arguments
     * @param int|string $before only show results with mtime before this date,
     *                           accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        $data = [
            'query' => $query,
            'sort' => $sort ?? 'hits',
            'after' => $after,
            'before' => $before,
        ];
        // bound by reference so the caller's variable receives the highlight terms
        $data['highlight'] =& $highlight;

        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, [$this, 'pageSearchCallBack']);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and writes
     * the terms to highlight back into 'highlight'.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @param array $data event data
     * @return array matching documents as pageid => score
     */
    public function pageSearchCallBack(&$data)
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // look up all words via FrequencyCollectionSearch
        $search = new FrequencyCollectionSearch(new PageFulltextCollection());
        foreach ($q['words'] as $word) {
            try {
                $search->addTerm($word);
            } catch (SearchException $e) {
                // the term was rejected (original note: too short or invalid), skip it
            }
        }
        $terms = $search->execute();

        // evaluate the query
        $docs = (new QueryEvaluator($q['parsed_ary'], $terms))->evaluate();
        if (empty($docs)) return [];

        // prepare time filters: integers are used as timestamps as-is, anything
        // else goes through strtotime(); an unparsable value yields false and
        // therefore disables that limit
        $after = $data['after']
            ? (is_int($data['after']) ? $data['after'] : strtotime($data['after']))
            : null;
        $before = $data['before']
            ? (is_int($data['before']) ? $data['before'] : strtotime($data['before']))
            : null;

        // filter by settings, acls, existence, and time range
        $docs = array_filter($docs, static function ($id) use ($after, $before) {
            if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
                return false;
            }
            if ($after || $before) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) return false;
                if ($before && $before < $mTime) return false;
            }
            return true;
        }, ARRAY_FILTER_USE_KEY);

        if ($data['sort'] === 'mtime') {
            // newest first; stat every page once up front instead of twice per comparison
            $mtimes = [];
            foreach (array_keys($docs) as $id) {
                $mtimes[$id] = filemtime(wikiFN($id));
            }
            uksort($docs, static function ($a, $b) use ($mtimes) {
                return $mtimes[$b] <=> $mtimes[$a];
            });
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * Loads the raw page text, strips soft hyphens and, unless an event handler
     * prevents the default action, builds an escaped excerpt in which the
     * highlight terms are wrapped in <strong class="search_hit">. Handlers may
     * change 'text' and 'highlight' (passed by reference) or supply their own
     * 'snippet'.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the snippet stored in the event data (a string by default)
     */
    public function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            // $text and $highlight are read after advise_before() on purpose:
            // handlers may have modified them through the references above
            $evdata['snippet'] = $this->buildSnippet($text, $highlight);
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Builds the default snippet: up to four context windows around matches,
     * joined with '... ', escaped, with every hit wrapped in a strong tag.
     *
     * @param string $text the page text to extract from
     * @param array|string $highlight terms to highlight
     * @return string|null the snippet; empty when no term can be matched
     */
    protected function buildSnippet($text, $highlight)
    {
        $re1 = $this->highlightRegex($highlight);
        if ($re1 === '') {
            // nothing to search for; same result as when no term occurs in the text
            return '';
        }
        // prefer places where two or three *different* terms occur close together
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        $snippets = [];
        $utf8_offset = $offset = $end = 0;
        $len = Utf8\PhpString::strlen($text);

        for ($cnt = 4; $cnt--;) {
            // try the most specific pattern first, fall back to a single term
            $match = [];
            $found = false;
            foreach ([$re3, $re2, $re1] as $re) {
                if (preg_match('/' . $re . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) break;

            [$str, $idx] = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = Utf8\PhpString::strlen($str);

            // establish context, up to 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
            }

            // set $offset for next match attempt
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
            $offset = Utf8\Clean::correctIdx($text, $offset);
        }

        // mark hits with a control character first so the markers survive the
        // escaping done by hsc(), then turn the marker pairs into markup
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        return preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(join('... ', $snippets))
        );
    }

    /**
     * Builds the capturing alternation that matches any of the highlight terms.
     *
     * Terms are quoted with preg_quote_cb() and passed through
     * snippetRePreprocess(). Patterns that end up empty are dropped: an empty
     * alternative would match the empty string at every position and hide the
     * real hits.
     *
     * @param array|string $highlight terms to highlight
     * @return string the pattern (without delimiters), or '' if no term is usable
     */
    protected function highlightRegex($highlight)
    {
        $patterns = array_map(
            [$this, 'snippetRePreprocess'],
            array_map('preg_quote_cb', array_filter((array) $highlight))
        );
        $patterns = array_filter($patterns, static function ($pattern) {
            return (string) $pattern !== '';
        });

        return $patterns ? '(' . join('|', $patterns) . ')' : '';
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A leading or trailing quoted asterisk (\*) is removed instead of adding
     * the boundary check on that side. Asian terms are returned unchanged.
     *
     * @param string $term an already regex-quoted search term
     * @return string the pattern, or '' if nothing but boundary checks is left
     */
    public function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        // left side: strip the wildcard or require a boundary
        if (substr($term, 0, 2) === '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        // right side: strip the wildcard or require a boundary
        if (substr($term, -2, 2) === '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term . $BR;
        }

        // only boundary checks left, there is no actual term to match
        if ($term === $BL || $term === $BR || $term === $BL . $BR) {
            $term = '';
        }
        return $term;
    }
}