1<?php 2 3namespace dokuwiki\Search; 4 5use dokuwiki\Extension\Event; 6use dokuwiki\Search\FulltextIndex; 7use dokuwiki\Search\QueryParser; 8use dokuwiki\Utf8; 9 10// create snippets for the first few results only 11const FT_SNIPPET_NUMBER = 15; 12 13/** 14 * Class DokuWiki Fulltext Search 15 * 16 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html) 17 * @author Andreas Gohr <andi@splitbrain.org> 18 */ 19class FulltextSearch 20{ 21 /** @var FulltextSearch $instance */ 22 protected static $instance = null; 23 24 /** 25 * Get new or existing singleton instance of the FulltextSearch 26 * 27 * @return FulltextSearch 28 */ 29 public static function getInstance() 30 { 31 if (is_null(static::$instance)) { 32 static::$instance = new static(); 33 } 34 return static::$instance; 35 } 36 37 /** 38 * Fulltext Search constructor. prevent direct object creation 39 */ 40 protected function __construct() {} 41 42 /** 43 * The fulltext search 44 * 45 * Returns a list of matching documents for the given query 46 * 47 * refactored into pageSearch(), pageSearchCallBack() and trigger_event() 48 * 49 * @param string $query 50 * @param array $highlight 51 * @param string $sort 52 * @param int|string $after only show results with mtime after this date, 53 * accepts timestap or strtotime arguments 54 * @param int|string $before only show results with mtime before this date, 55 * accepts timestap or strtotime arguments 56 * 57 * @return array 58 */ 59 public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null) 60 { 61 if ($sort === null) { 62 $sort = 'hits'; 63 } 64 $data = [ 65 'query' => $query, 66 'sort' => $sort, 67 'after' => $after, 68 'before' => $before 69 ]; 70 $data['highlight'] =& $highlight; 71 $action = [$this, 'pageSearchCallBack']; 72 return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action); 73 } 74 75 /** 76 * Returns a list of matching documents for the given query 77 * 78 * @author Andreas Gohr <andi@splitbrain.org> 79 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 80 * 81 * @param array $data event data 82 * @return array matching documents 83 */ 84 public function pageSearchCallBack(&$data) 85 { 86 // parse the given query 87 $q = (new QueryParser)->convert($data['query']); 88 $data['highlight'] = $q['highlight']; 89 90 if (empty($q['parsed_ary'])) return array(); 91 92 // lookup all words found in the query 93 $FulltextIndex = FulltextIndex::getInstance(); 94 $lookup = $FulltextIndex->lookupWords($q['words']); 95 96 // get all pages in this dokuwiki site (!: includes nonexistent pages) 97 $pages_all = array(); 98 foreach ($FulltextIndex->getPages() as $id) { 99 $pages_all[$id] = 0; // base: 0 hit 100 } 101 102 // process the query 103 $stack = array(); 104 foreach ($q['parsed_ary'] as $token) { 105 switch (substr($token, 0, 3)) { 106 case 'W+:': 107 case 'W-:': 108 case 'W_:': // word 109 $word = substr($token, 3); 110 $stack[] = (array) $lookup[$word]; 111 break; 112 case 'P+:': 113 case 'P-:': // phrase 114 $phrase = substr($token, 3); 115 // since phrases are always parsed as ((W1)(W2)...(P)), 116 // the end($stack) always points the pages that contain 117 // all words in this phrase 118 $pages = end($stack); 119 $pages_matched = array(); 120 foreach (array_keys($pages) as $id) { 121 $evdata = array( 122 'id' => $id, 123 'phrase' => $phrase, 124 'text' => rawWiki($id) 125 ); 126 $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 127 if ($event->advise_before() && $event->result !== true) { 128 $text = Utf8\PhpString::strtolower($evdata['text']); 129 if (strpos($text, $phrase) !== false) { 130 $event->result = true; 131 } 132 } 133 $event->advise_after(); 134 if ($event->result === true) { 135 $pages_matched[$id] = 0; // phrase: always 0 hit 136 } 137 } 138 $stack[] = $pages_matched; 139 break; 140 case 'N+:': 141 case 'N-:': // namespace 142 $ns = cleanID(substr($token, 3)) . ':'; 143 $pages_matched = array(); 144 foreach (array_keys($pages_all) as $id) { 145 if (strpos($id, $ns) === 0) { 146 $pages_matched[$id] = 0; // namespace: always 0 hit 147 } 148 } 149 $stack[] = $pages_matched; 150 break; 151 case 'AND': // and operation 152 list($pages1, $pages2) = array_splice($stack, -2); 153 $stack[] = $this->resultCombine(array($pages1, $pages2)); 154 break; 155 case 'OR': // or operation 156 list($pages1, $pages2) = array_splice($stack, -2); 157 $stack[] = $this->resultUnite(array($pages1, $pages2)); 158 break; 159 case 'NOT': // not operation (unary) 160 $pages = array_pop($stack); 161 $stack[] = $this->resultComplement(array($pages_all, $pages)); 162 break; 163 } 164 } 165 $docs = array_pop($stack); 166 167 if (empty($docs)) return array(); 168 169 // check: settings, acls, existence 170 foreach (array_keys($docs) as $id) { 171 if (isHiddenPage($id) 172 || auth_quickaclcheck($id) < AUTH_READ 173 || !page_exists($id, '', false) 174 ) { 175 unset($docs[$id]); 176 } 177 } 178 179 $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']); 180 181 if ($data['sort'] === 'mtime') { 182 uksort($docs, [$this, 'pagemtimesorter']); 183 } else { 184 // sort docs by count 185 arsort($docs); 186 } 187 188 return $docs; 189 } 190 191 /** 192 * @param array $results search results in the form pageid => value 193 * @param int|string $after only returns results with mtime after this date, 194 * accepts timestap or strtotime arguments 195 * @param int|string $before only returns results with mtime after this date, 196 * accepts timestap or strtotime arguments 197 * 198 * @return array 199 */ 200 protected function filterResultsByTime(array $results, $after, $before) 201 { 202 if ($after || $before) { 203 $after = is_int($after) ? $after : strtotime($after); 204 $before = is_int($before) ? $before : strtotime($before); 205 206 foreach ($results as $id => $value) { 207 $mTime = filemtime(wikiFN($id)); 208 if ($after && $after > $mTime) { 209 unset($results[$id]); 210 continue; 211 } 212 if ($before && $before < $mTime) { 213 unset($results[$id]); 214 } 215 } 216 } 217 return $results; 218 } 219 220 /** 221 * Sort pages by their mtime, from newest to oldest 222 * 223 * @param string $a 224 * @param string $b 225 * 226 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a 227 * and 0 if they are of the same age 228 */ 229 protected function pagemtimesorter($a, $b) 230 { 231 $mtimeA = filemtime(wikiFN($a)); 232 $mtimeB = filemtime(wikiFN($b)); 233 return $mtimeB - $mtimeA; 234 } 235 236 /** 237 * Creates a snippet extract 238 * 239 * @author Andreas Gohr <andi@splitbrain.org> 240 * @triggers FULLTEXT_SNIPPET_CREATE 241 * 242 * @param string $id page id 243 * @param array $highlight 244 * @return mixed 245 */ 246 public function snippet($id, $highlight) 247 { 248 $text = rawWiki($id); 249 $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens 250 $evdata = array( 251 'id' => $id, 252 'text' => &$text, 253 'highlight' => &$highlight, 254 'snippet' => '', 255 ); 256 257 $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata); 258 if ($evt->advise_before()) { 259 $match = array(); 260 $snippets = array(); 261 $utf8_offset = $offset = $end = 0; 262 $len = Utf8\PhpString::strlen($text); 263 264 // build a regexp from the phrases to highlight 265 $re1 = '(' . 266 join( 267 '|', 268 array_map( 269 [$this, 'snippetRePreprocess'], 270 array_map( 271 'preg_quote_cb', 272 array_filter((array) $highlight) 273 ) 274 ) 275 ) . 276 ')'; 277 $re2 = "$re1.{0,75}(?!\\1)$re1"; 278 $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1"; 279 280 for ($cnt=4; $cnt--;) { 281 if (0) { 282 } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 283 } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 284 } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 285 } else { 286 break; 287 } 288 289 list($str, $idx) = $match[0]; 290 291 // convert $idx (a byte offset) into a utf8 character offset 292 $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx)); 293 $utf8_len = Utf8\PhpString::strlen($str); 294 295 // establish context, 100 bytes surrounding the match string 296 // first look to see if we can go 100 either side, 297 // then drop to 50 adding any excess if the other side can't go to 50, 298 $pre = min($utf8_idx - $utf8_offset, 100); 299 $post = min($len - $utf8_idx - $utf8_len, 100); 300 301 if ($pre > 50 && $post > 50) { 302 $pre = $post = 50; 303 } elseif ($pre > 50) { 304 $pre = min($pre, 100 - $post); 305 } elseif ($post > 50) { 306 $post = min($post, 100 - $pre); 307 } elseif ($offset == 0) { 308 // both are less than 50, means the context is the whole string 309 // make it so and break out of this loop - there is no need for the 310 // complex snippet calculations 311 $snippets = array($text); 312 break; 313 } 314 315 // establish context start and end points, try to append to previous 316 // context if possible 317 $start = $utf8_idx - $pre; 318 $append = ($start < $end) ? $end : false; // still the end of the previous context snippet 319 $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context 320 321 if ($append) { 322 $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append); 323 } else { 324 $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start); 325 } 326 327 // set $offset for next match attempt 328 // continue matching after the current match 329 // if the current match is not the longest possible match starting at the current offset 330 // this prevents further matching of this snippet but for possible matches of length 331 // smaller than match length + context (at least 50 characters) this match is part of the context 332 $utf8_offset = $utf8_idx + $utf8_len; 333 $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len)); 334 $offset = Utf8\Clean::correctIdx($text, $offset); 335 } 336 337 $m = "\1"; 338 $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets); 339 $snippet = preg_replace( 340 '/' . $m . '([^' . $m . ']*?)' . $m . '/iu', 341 '<strong class="search_hit">$1</strong>', 342 hsc(join('... ', $snippets)) 343 ); 344 345 $evdata['snippet'] = $snippet; 346 } 347 $evt->advise_after(); 348 unset($evt); 349 350 return $evdata['snippet']; 351 } 352 353 /** 354 * Wraps a search term in regex boundary checks. 355 * 356 * @param string $term 357 * @return string 358 */ 359 public function snippetRePreprocess($term) 360 { 361 // do not process asian terms where word boundaries are not explicit 362 if (Utf8\Asian::isAsianWords($term)) return $term; 363 364 if (UTF8_PROPERTYSUPPORT) { 365 // unicode word boundaries 366 // see http://stackoverflow.com/a/2449017/172068 367 $BL = '(?<!\pL)'; 368 $BR = '(?!\pL)'; 369 } else { 370 // not as correct as above, but at least won't break 371 $BL = '\b'; 372 $BR = '\b'; 373 } 374 375 if (substr($term, 0, 2) == '\\*') { 376 $term = substr($term, 2); 377 } else { 378 $term = $BL.$term; 379 } 380 381 if (substr($term, -2, 2) == '\\*') { 382 $term = substr($term, 0, -2); 383 } else { 384 $term = $term.$BR; 385 } 386 387 if ($term == $BL || $term == $BR || $term == $BL.$BR) { 388 $term = ''; 389 } 390 return $term; 391 } 392 393 /** 394 * Combine found documents and sum up their scores 395 * 396 * This function is used to combine searched words with a logical 397 * AND. Only documents available in all arrays are returned. 398 * 399 * based upon PEAR's PHP_Compat function for array_intersect_key() 400 * 401 * @param array $args An array of page arrays 402 * @return array 403 */ 404 protected function resultCombine($args) 405 { 406 $array_count = count($args); 407 if ($array_count == 1) { 408 return $args[0]; 409 } 410 411 $result = array(); 412 if ($array_count > 1) { 413 foreach ($args[0] as $key => $value) { 414 $result[$key] = $value; 415 for ($i = 1; $i !== $array_count; $i++) { 416 if (!isset($args[$i][$key])) { 417 unset($result[$key]); 418 break; 419 } 420 $result[$key] += $args[$i][$key]; 421 } 422 } 423 } 424 return $result; 425 } 426 427 /** 428 * Unites found documents and sum up their scores 429 * based upon resultCombine() method 430 * 431 * @param array $args An array of page arrays 432 * @return array 433 * 434 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 435 */ 436 protected function resultUnite($args) 437 { 438 $array_count = count($args); 439 if ($array_count === 1) { 440 return $args[0]; 441 } 442 443 $result = $args[0]; 444 for ($i = 1; $i !== $array_count; $i++) { 445 foreach (array_keys($args[$i]) as $id) { 446 $result[$id] += $args[$i][$id]; 447 } 448 } 449 return $result; 450 } 451 452 /** 453 * Computes the difference of documents using page id for comparison 454 * nearly identical to PHP5's array_diff_key() 455 * 456 * @param array $args An array of page arrays 457 * @return array 458 * 459 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 460 */ 461 protected function resultComplement($args) 462 { 463 $array_count = count($args); 464 if ($array_count === 1) { 465 return $args[0]; 466 } 467 468 $result = $args[0]; 469 foreach (array_keys($result) as $id) { 470 for ($i = 1; $i !== $array_count; $i++) { 471 if (isset($args[$i][$id])) unset($result[$id]); 472 } 473 } 474 return $result; 475 } 476} 477