<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\PageIndex;
use dokuwiki\Search\PagewordIndex;
use dokuwiki\Search\QueryParser;
use dokuwiki\Utf8;

// create snippets for the first few results only
const FT_SNIPPET_NUMBER = 15;

/**
 * Class DokuWiki Fulltext Search
 *
 * Static-only helper: evaluates a parsed search query against the word and
 * page indexes and builds highlighted text snippets for result pages.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /**
     * Fulltext Search constructor. prevent direct object creation
     */
    protected function __construct() {}

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query.
     * The actual work is done in pageSearchCallBack(), which is run as the
     * default action of the SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query
     * @param array      $highlight  filled (by reference) with the terms to highlight
     * @param string     $sort       'mtime' to sort by modification time, anything else
     *                               (default 'hits') sorts by hit count
     * @param int|string $after      only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string $before     only show results with mtime before this date,
     *                               accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        if ($sort === null) {
            $sort = 'hits';
        }
        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // keep the reference so the callback can hand the highlight terms back to the caller
        $data['highlight'] =& $highlight;
        $action = static::class.'::pageSearchCallBack';
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * The parsed query is a postfix token list that is evaluated with a stack
     * of "page id => hit count" arrays.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     * @triggers FULLTEXT_PHRASE_MATCH
     *
     * @param array $data  event data (keys: query, sort, after, before, highlight)
     * @return array matching documents (page id => hit count)
     */
    public static function pageSearchCallBack(&$data)
    {
        // parse the given query
        $q = QueryParser::convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // lookup all words found in the query
        $PagewordIndex = PagewordIndex::getInstance();
        $lookup = $PagewordIndex->lookup($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $PageIndex = PageIndex::getInstance();
        $pages_all = [];
        foreach ($PageIndex->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = [];
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word = substr($token, 3);
                    // a word without a lookup entry matches no page
                    // (avoids reading an undefined array key)
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : [];
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    $phrase = substr($token, 3);
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $pages = end($stack);
                    if (!is_array($pages)) {
                        // end() yields false on an empty stack
                        $pages = [];
                    }
                    $pages_matched = [];
                    foreach (array_keys($pages) as $id) {
                        $evdata = [
                            'id' => $id,
                            'phrase' => $phrase,
                            'text' => rawWiki($id)
                        ];
                        $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                        // handlers may decide the match themselves by setting the result to true
                        if ($evt->advise_before() && $evt->result !== true) {
                            $text = Utf8\PhpString::strtolower($evdata['text']);
                            if (strpos($text, $phrase) !== false) {
                                $evt->result = true;
                            }
                        }
                        $evt->advise_after();
                        if ($evt->result === true) {
                            $pages_matched[$id] = 0; // phrase: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = [];
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultCombine([$pages1, $pages2]);
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultUnite([$pages1, $pages2]);
                    break;
                case 'NOT': // not operation (unary)
                    $pages = array_pop($stack);
                    $stack[] = static::resultComplement([$pages_all, $pages]);
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) return [];

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, static::class.'::pagemtimesorter');
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Drops results whose page file was modified outside the given time window
     *
     * A limit that is null (or otherwise empty) is not applied.
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array
     */
    protected static function filterResultsByTime(array $results, $after, $before)
    {
        if (!$after && !$before) {
            return $results;
        }

        // leave a missing (null) limit alone: handing null to strtotime() is
        // deprecated since PHP 8.1 and the limit is skipped below anyway
        $after = ($after === null || is_int($after)) ? $after : strtotime($after);
        $before = ($before === null || is_int($before)) ? $before : strtotime($before);

        foreach ($results as $id => $value) {
            $mTime = filemtime(wikiFN($id));
            if ($after && $after > $mTime) {
                unset($results[$id]);
                continue;
            }
            if ($before && $before < $mTime) {
                unset($results[$id]);
            }
        }
        return $results;
    }

    /**
     * Sort pages by their mtime, from newest to oldest
     *
     * @param string $a
     * @param string $b
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected static function pagemtimesorter($a, $b)
    {
        $mtimeA = filemtime(wikiFN($a));
        $mtimeB = filemtime(wikiFN($b));
        return $mtimeB - $mtimeA;
    }

    /**
     * Creates a snippet extract
     *
     * Looks for up to four matches of the highlight terms in the raw page
     * text, collects each match with some surrounding context and wraps the
     * matched terms in <strong class="search_hit"> tags.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight
     * @return mixed
     */
    public static function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = [];
            $snippets = [];
            $utf8_offset = $offset = $end = 0;
            $len = Utf8\PhpString::strlen($text);

            // build a regexp from the phrases to highlight
            $re1 = '(' .
                join(
                    '|',
                    array_map(
                        static::class.'::snippetRePreprocess',
                        array_map(
                            'preg_quote_cb',
                            array_filter((array) $highlight)
                        )
                    )
                ) .
                ')';
            // two resp. three different terms close to each other
            $re2 = "$re1.{0,75}(?!\\1)$re1";
            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

            // at most four snippet parts
            for ($round = 0; $round < 4; $round++) {
                // prefer the densest match: three terms, then two, then a single one;
                // $match holds the captures of the first pattern that matched
                if (!preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    && !preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    && !preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                ) {
                    break;
                }

                list($str, $idx) = $match[0];

                // convert $idx (a byte offset) into a utf8 character offset
                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
                $utf8_len = Utf8\PhpString::strlen($str);

                // establish context, 100 characters surrounding the match string
                // first look to see if we can go 100 either side,
                // then drop to 50 adding any excess if the other side can't go to 50,
                $pre = min($utf8_idx - $utf8_offset, 100);
                $post = min($len - $utf8_idx - $utf8_len, 100);

                if ($pre > 50 && $post > 50) {
                    $pre = $post = 50;
                } elseif ($pre > 50) {
                    $pre = min($pre, 100 - $post);
                } elseif ($post > 50) {
                    $post = min($post, 100 - $pre);
                } elseif ($offset == 0) {
                    // both are less than 50, means the context is the whole string
                    // make it so and break out of this loop - there is no need for the
                    // complex snippet calculations
                    $snippets = [$text];
                    break;
                }

                // establish context start and end points, try to append to previous
                // context if possible
                $start = $utf8_idx - $pre;
                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                if ($append) {
                    $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
                } else {
                    $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
                }

                // set $offset for next match attempt
                // continue matching after the current match
                // if the current match is not the longest possible match starting at the current offset
                // this prevents further matching of this snippet but for possible matches of length
                // smaller than match length + context (at least 50 characters) this match is part of the context
                $utf8_offset = $utf8_idx + $utf8_len;
                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
                $offset = Utf8\Clean::correctIdx($text, $offset);
            }

            // mark the hits with a control character first, so that the markers
            // survive the HTML escaping and can be turned into tags afterwards
            $m = "\1";
            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
            $snippet = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                hsc(join('... ', $snippets))
            );

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A leading or trailing escaped asterisk (as found in an already quoted
     * term) is removed and no boundary check is added on that side.
     *
     * @param string $term
     * @return string
     */
    public static function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (substr($term, 0, 2) === '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL.$term;
        }

        if (substr($term, -2, 2) === '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term.$BR;
        }

        // nothing but boundary checks left: the term itself was empty
        if ($term === $BL || $term === $BR || $term === $BL.$BR) {
            $term = '';
        }
        return $term;
    }

    /**
     * Combine found documents and sum up their scores
     *
     * This function is used to combine searched words with a logical
     * AND. Only documents available in all arrays are returned.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays
     * @return array
     */
    protected static function resultCombine($args)
    {
        $array_count = count($args);
        if ($array_count == 1) {
            return $args[0];
        }

        $result = [];
        if ($array_count > 1) {
            foreach ($args[0] as $key => $value) {
                $result[$key] = $value;
                for ($i = 1; $i !== $array_count; $i++) {
                    if (!isset($args[$i][$key])) {
                        // missing in one of the arrays: not part of the intersection
                        unset($result[$key]);
                        break;
                    }
                    $result[$key] += $args[$i][$key];
                }
            }
        }
        return $result;
    }

    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * Used for the logical OR: a document found in any of the arrays is
     * returned, its score is the sum over all arrays containing it.
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultUnite($args)
    {
        $array_count = count($args);
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        for ($i = 1; $i !== $array_count; $i++) {
            foreach (array_keys($args[$i]) as $id) {
                if (isset($result[$id])) {
                    $result[$id] += $args[$i][$id];
                } else {
                    // first occurrence of this page: adding to a missing key
                    // would raise an "undefined array key" error
                    $result[$id] = $args[$i][$id];
                }
            }
        }
        return $result;
    }

    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * Returns the entries of the first array whose id is not set in any of
     * the other arrays.
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultComplement($args)
    {
        $array_count = count($args);
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        foreach (array_keys($result) as $id) {
            for ($i = 1; $i !== $array_count; $i++) {
                if (isset($args[$i][$id])) {
                    unset($result[$id]);
                    break; // already removed, no need to check the remaining arrays
                }
            }
        }
        return $result;
    }
}