1<?php 2 3namespace dokuwiki\Search; 4 5use dokuwiki\Extension\Event; 6use dokuwiki\Utf8; 7 8// create snippets for the first few results only 9const FT_SNIPPET_NUMBER = 15; 10 11/** 12 * Class DokuWiki Fulltext Search 13 * 14 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html) 15 * @author Andreas Gohr <andi@splitbrain.org> 16 */ 17class FulltextSearch 18{ 19 /** @var FulltextSearch $instance */ 20 protected static $instance = null; 21 22 /** 23 * Get new or existing singleton instance of the FulltextSearch 24 * 25 * @return FulltextSearch 26 */ 27 public static function getInstance() 28 { 29 if (is_null(static::$instance)) { 30 static::$instance = new static(); 31 } 32 return static::$instance; 33 } 34 35 /** 36 * Fulltext Search constructor. prevent direct object creation 37 */ 38 protected function __construct() {} 39 40 /** 41 * The fulltext search 42 * 43 * Returns a list of matching documents for the given query 44 * 45 * refactored into pageSearch(), pageSearchCallBack() and trigger_event() 46 * 47 * @param string $query 48 * @param array $highlight 49 * @param string $sort 50 * @param int|string $after only show results with mtime after this date, 51 * accepts timestap or strtotime arguments 52 * @param int|string $before only show results with mtime before this date, 53 * accepts timestap or strtotime arguments 54 * 55 * @return array 56 */ 57 public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null) 58 { 59 if ($sort === null) { 60 $sort = 'hits'; 61 } 62 $data = [ 63 'query' => $query, 64 'sort' => $sort, 65 'after' => $after, 66 'before' => $before 67 ]; 68 $data['highlight'] =& $highlight; 69 $action = [$this, 'pageSearchCallBack']; 70 return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action); 71 } 72 73 /** 74 * Returns a list of matching documents for the given query 75 * 76 * @author Andreas Gohr <andi@splitbrain.org> 77 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 78 * 79 * @param array $data event data 80 * @return array matching documents 81 */ 82 public function pageSearchCallBack(&$data) 83 { 84 // parse the given query 85 $q = (new QueryParser)->convert($data['query']); 86 $data['highlight'] = $q['highlight']; 87 88 if (empty($q['parsed_ary'])) return array(); 89 90 // lookup all words found in the query 91 $FulltextIndex = FulltextIndex::getInstance(); 92 $lookup = $FulltextIndex->lookupWords($q['words']); 93 94 // get all pages in this dokuwiki site (!: includes nonexistent pages) 95 $pages_all = array(); 96 foreach ($FulltextIndex->getPages() as $id) { 97 $pages_all[$id] = 0; // base: 0 hit 98 } 99 100 // process the query 101 $stack = array(); 102 foreach ($q['parsed_ary'] as $token) { 103 switch (substr($token, 0, 3)) { 104 case 'W+:': 105 case 'W-:': 106 case 'W_:': // word 107 $word = substr($token, 3); 108 $stack[] = (array) $lookup[$word]; 109 break; 110 case 'P+:': 111 case 'P-:': // phrase 112 $phrase = substr($token, 3); 113 // since phrases are always parsed as ((W1)(W2)...(P)), 114 // the end($stack) always points the pages that contain 115 // all words in this phrase 116 $pages = end($stack); 117 $pages_matched = array(); 118 foreach (array_keys($pages) as $id) { 119 $evdata = array( 120 'id' => $id, 121 'phrase' => $phrase, 122 'text' => rawWiki($id) 123 ); 124 $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 125 if ($event->advise_before() && $event->result !== true) { 126 $text = Utf8\PhpString::strtolower($evdata['text']); 127 if (strpos($text, $phrase) !== false) { 128 $event->result = true; 129 } 130 } 131 $event->advise_after(); 132 if ($event->result === true) { 133 $pages_matched[$id] = 0; // phrase: always 0 hit 134 } 135 } 136 $stack[] = $pages_matched; 137 break; 138 case 'N+:': 139 case 'N-:': // namespace 140 $ns = cleanID(substr($token, 3)) . ':'; 141 $pages_matched = array(); 142 foreach (array_keys($pages_all) as $id) { 143 if (strpos($id, $ns) === 0) { 144 $pages_matched[$id] = 0; // namespace: always 0 hit 145 } 146 } 147 $stack[] = $pages_matched; 148 break; 149 case 'AND': // and operation 150 list($pages1, $pages2) = array_splice($stack, -2); 151 $stack[] = $this->resultCombine(array($pages1, $pages2)); 152 break; 153 case 'OR': // or operation 154 list($pages1, $pages2) = array_splice($stack, -2); 155 $stack[] = $this->resultUnite(array($pages1, $pages2)); 156 break; 157 case 'NOT': // not operation (unary) 158 $pages = array_pop($stack); 159 $stack[] = $this->resultComplement(array($pages_all, $pages)); 160 break; 161 } 162 } 163 $docs = array_pop($stack); 164 165 if (empty($docs)) return array(); 166 167 // check: settings, acls, existence 168 foreach (array_keys($docs) as $id) { 169 if (isHiddenPage($id) 170 || auth_quickaclcheck($id) < AUTH_READ 171 || !page_exists($id, '', false) 172 ) { 173 unset($docs[$id]); 174 } 175 } 176 177 $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']); 178 179 if ($data['sort'] === 'mtime') { 180 uksort($docs, [$this, 'pagemtimesorter']); 181 } else { 182 // sort docs by count 183 arsort($docs); 184 } 185 186 return $docs; 187 } 188 189 /** 190 * @param array $results search results in the form pageid => value 191 * @param int|string $after only returns results with mtime after this date, 192 * accepts timestap or strtotime arguments 193 * @param int|string $before only returns results with mtime after this date, 194 * accepts timestap or strtotime arguments 195 * 196 * @return array 197 */ 198 protected function filterResultsByTime(array $results, $after, $before) 199 { 200 if ($after || $before) { 201 $after = is_int($after) ? $after : strtotime($after); 202 $before = is_int($before) ? $before : strtotime($before); 203 204 foreach ($results as $id => $value) { 205 $mTime = filemtime(wikiFN($id)); 206 if ($after && $after > $mTime) { 207 unset($results[$id]); 208 continue; 209 } 210 if ($before && $before < $mTime) { 211 unset($results[$id]); 212 } 213 } 214 } 215 return $results; 216 } 217 218 /** 219 * Sort pages by their mtime, from newest to oldest 220 * 221 * @param string $a 222 * @param string $b 223 * 224 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a 225 * and 0 if they are of the same age 226 */ 227 protected function pagemtimesorter($a, $b) 228 { 229 $mtimeA = filemtime(wikiFN($a)); 230 $mtimeB = filemtime(wikiFN($b)); 231 return $mtimeB - $mtimeA; 232 } 233 234 /** 235 * Creates a snippet extract 236 * 237 * @author Andreas Gohr <andi@splitbrain.org> 238 * @triggers FULLTEXT_SNIPPET_CREATE 239 * 240 * @param string $id page id 241 * @param array $highlight 242 * @return mixed 243 */ 244 public function snippet($id, $highlight) 245 { 246 $text = rawWiki($id); 247 $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens 248 $evdata = array( 249 'id' => $id, 250 'text' => &$text, 251 'highlight' => &$highlight, 252 'snippet' => '', 253 ); 254 255 $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata); 256 if ($evt->advise_before()) { 257 $match = array(); 258 $snippets = array(); 259 $utf8_offset = $offset = $end = 0; 260 $len = Utf8\PhpString::strlen($text); 261 262 // build a regexp from the phrases to highlight 263 $re1 = '(' . 264 join( 265 '|', 266 array_map( 267 [$this, 'snippetRePreprocess'], 268 array_map( 269 'preg_quote_cb', 270 array_filter((array) $highlight) 271 ) 272 ) 273 ) . 274 ')'; 275 $re2 = "$re1.{0,75}(?!\\1)$re1"; 276 $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1"; 277 278 for ($cnt=4; $cnt--;) { 279 if (0) { 280 } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 281 } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 282 } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 283 } else { 284 break; 285 } 286 287 list($str, $idx) = $match[0]; 288 289 // convert $idx (a byte offset) into a utf8 character offset 290 $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx)); 291 $utf8_len = Utf8\PhpString::strlen($str); 292 293 // establish context, 100 bytes surrounding the match string 294 // first look to see if we can go 100 either side, 295 // then drop to 50 adding any excess if the other side can't go to 50, 296 $pre = min($utf8_idx - $utf8_offset, 100); 297 $post = min($len - $utf8_idx - $utf8_len, 100); 298 299 if ($pre > 50 && $post > 50) { 300 $pre = $post = 50; 301 } elseif ($pre > 50) { 302 $pre = min($pre, 100 - $post); 303 } elseif ($post > 50) { 304 $post = min($post, 100 - $pre); 305 } elseif ($offset == 0) { 306 // both are less than 50, means the context is the whole string 307 // make it so and break out of this loop - there is no need for the 308 // complex snippet calculations 309 $snippets = array($text); 310 break; 311 } 312 313 // establish context start and end points, try to append to previous 314 // context if possible 315 $start = $utf8_idx - $pre; 316 $append = ($start < $end) ? $end : false; // still the end of the previous context snippet 317 $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context 318 319 if ($append) { 320 $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append); 321 } else { 322 $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start); 323 } 324 325 // set $offset for next match attempt 326 // continue matching after the current match 327 // if the current match is not the longest possible match starting at the current offset 328 // this prevents further matching of this snippet but for possible matches of length 329 // smaller than match length + context (at least 50 characters) this match is part of the context 330 $utf8_offset = $utf8_idx + $utf8_len; 331 $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len)); 332 $offset = Utf8\Clean::correctIdx($text, $offset); 333 } 334 335 $m = "\1"; 336 $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets); 337 $snippet = preg_replace( 338 '/' . $m . '([^' . $m . ']*?)' . $m . '/iu', 339 '<strong class="search_hit">$1</strong>', 340 hsc(join('... ', $snippets)) 341 ); 342 343 $evdata['snippet'] = $snippet; 344 } 345 $evt->advise_after(); 346 unset($evt); 347 348 return $evdata['snippet']; 349 } 350 351 /** 352 * Wraps a search term in regex boundary checks. 353 * 354 * @param string $term 355 * @return string 356 */ 357 public function snippetRePreprocess($term) 358 { 359 // do not process asian terms where word boundaries are not explicit 360 if (Utf8\Asian::isAsianWords($term)) return $term; 361 362 if (UTF8_PROPERTYSUPPORT) { 363 // unicode word boundaries 364 // see http://stackoverflow.com/a/2449017/172068 365 $BL = '(?<!\pL)'; 366 $BR = '(?!\pL)'; 367 } else { 368 // not as correct as above, but at least won't break 369 $BL = '\b'; 370 $BR = '\b'; 371 } 372 373 if (substr($term, 0, 2) == '\\*') { 374 $term = substr($term, 2); 375 } else { 376 $term = $BL.$term; 377 } 378 379 if (substr($term, -2, 2) == '\\*') { 380 $term = substr($term, 0, -2); 381 } else { 382 $term = $term.$BR; 383 } 384 385 if ($term == $BL || $term == $BR || $term == $BL.$BR) { 386 $term = ''; 387 } 388 return $term; 389 } 390 391 /** 392 * Combine found documents and sum up their scores 393 * 394 * This function is used to combine searched words with a logical 395 * AND. Only documents available in all arrays are returned. 396 * 397 * based upon PEAR's PHP_Compat function for array_intersect_key() 398 * 399 * @param array $args An array of page arrays 400 * @return array 401 */ 402 protected function resultCombine($args) 403 { 404 $array_count = count($args); 405 if ($array_count == 1) { 406 return $args[0]; 407 } 408 409 $result = array(); 410 if ($array_count > 1) { 411 foreach ($args[0] as $key => $value) { 412 $result[$key] = $value; 413 for ($i = 1; $i !== $array_count; $i++) { 414 if (!isset($args[$i][$key])) { 415 unset($result[$key]); 416 break; 417 } 418 $result[$key] += $args[$i][$key]; 419 } 420 } 421 } 422 return $result; 423 } 424 425 /** 426 * Unites found documents and sum up their scores 427 * based upon resultCombine() method 428 * 429 * @param array $args An array of page arrays 430 * @return array 431 * 432 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 433 */ 434 protected function resultUnite($args) 435 { 436 $array_count = count($args); 437 if ($array_count === 1) { 438 return $args[0]; 439 } 440 441 $result = $args[0]; 442 for ($i = 1; $i !== $array_count; $i++) { 443 foreach (array_keys($args[$i]) as $id) { 444 $result[$id] += $args[$i][$id]; 445 } 446 } 447 return $result; 448 } 449 450 /** 451 * Computes the difference of documents using page id for comparison 452 * nearly identical to PHP5's array_diff_key() 453 * 454 * @param array $args An array of page arrays 455 * @return array 456 * 457 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 458 */ 459 protected function resultComplement($args) 460 { 461 $array_count = count($args); 462 if ($array_count === 1) { 463 return $args[0]; 464 } 465 466 $result = $args[0]; 467 foreach (array_keys($result) as $id) { 468 for ($i = 1; $i !== $array_count; $i++) { 469 if (isset($args[$i][$id])) unset($result[$id]); 470 } 471 } 472 return $result; 473 } 474} 475