1<?php 2 3namespace dokuwiki\Search; 4 5use dokuwiki\Extension\Event; 6use dokuwiki\Search\FulltextIndex; 7use dokuwiki\Search\QueryParser; 8use dokuwiki\Utf8; 9 10// create snippets for the first few results only 11const FT_SNIPPET_NUMBER = 15; 12 13/** 14 * Class DokuWiki Fulltext Search 15 * 16 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html) 17 * @author Andreas Gohr <andi@splitbrain.org> 18 */ 19class FulltextSearch 20{ 21 /** 22 * Fulltext Search constructor. prevent direct object creation 23 */ 24 protected function __construct() {} 25 26 /** 27 * The fulltext search 28 * 29 * Returns a list of matching documents for the given query 30 * 31 * refactored into pageSearch(), pageSearchCallBack() and trigger_event() 32 * 33 * @param string $query 34 * @param array $highlight 35 * @param string $sort 36 * @param int|string $after only show results with mtime after this date, 37 * accepts timestap or strtotime arguments 38 * @param int|string $before only show results with mtime before this date, 39 * accepts timestap or strtotime arguments 40 * 41 * @return array 42 */ 43 public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null) 44 { 45 if ($sort === null) { 46 $sort = 'hits'; 47 } 48 $data = [ 49 'query' => $query, 50 'sort' => $sort, 51 'after' => $after, 52 'before' => $before 53 ]; 54 $data['highlight'] =& $highlight; 55 $action = static::class.'::pageSearchCallBack'; 56 return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action); 57 } 58 59 /** 60 * Returns a list of matching documents for the given query 61 * 62 * @author Andreas Gohr <andi@splitbrain.org> 63 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 64 * 65 * @param array $data event data 66 * @return array matching documents 67 */ 68 public static function pageSearchCallBack(&$data) 69 { 70 // parse the given query 71 $q = QueryParser::convert($data['query']); 72 $data['highlight'] = $q['highlight']; 73 74 if (empty($q['parsed_ary'])) return array(); 75 76 // lookup all words found in the query 77 $FulltextIndex = FulltextIndex::getInstance(); 78 $lookup = $FulltextIndex->lookupWords($q['words']); 79 80 // get all pages in this dokuwiki site (!: includes nonexistent pages) 81 $pages_all = array(); 82 foreach ($FulltextIndex->getPages() as $id) { 83 $pages_all[$id] = 0; // base: 0 hit 84 } 85 86 // process the query 87 $stack = array(); 88 foreach ($q['parsed_ary'] as $token) { 89 switch (substr($token, 0, 3)) { 90 case 'W+:': 91 case 'W-:': 92 case 'W_:': // word 93 $word = substr($token, 3); 94 $stack[] = (array) $lookup[$word]; 95 break; 96 case 'P+:': 97 case 'P-:': // phrase 98 $phrase = substr($token, 3); 99 // since phrases are always parsed as ((W1)(W2)...(P)), 100 // the end($stack) always points the pages that contain 101 // all words in this phrase 102 $pages = end($stack); 103 $pages_matched = array(); 104 foreach (array_keys($pages) as $id) { 105 $evdata = array( 106 'id' => $id, 107 'phrase' => $phrase, 108 'text' => rawWiki($id) 109 ); 110 $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata); 111 if ($event->advise_before() && $event->result !== true) { 112 $text = Utf8\PhpString::strtolower($evdata['text']); 113 if (strpos($text, $phrase) !== false) { 114 $event->result = true; 115 } 116 } 117 $event->advise_after(); 118 if ($event->result === true) { 119 $pages_matched[$id] = 0; // phrase: always 0 hit 120 } 121 } 122 $stack[] = $pages_matched; 123 break; 124 case 'N+:': 125 case 'N-:': // namespace 126 $ns = cleanID(substr($token, 3)) . ':'; 127 $pages_matched = array(); 128 foreach (array_keys($pages_all) as $id) { 129 if (strpos($id, $ns) === 0) { 130 $pages_matched[$id] = 0; // namespace: always 0 hit 131 } 132 } 133 $stack[] = $pages_matched; 134 break; 135 case 'AND': // and operation 136 list($pages1, $pages2) = array_splice($stack, -2); 137 $stack[] = static::resultCombine(array($pages1, $pages2)); 138 break; 139 case 'OR': // or operation 140 list($pages1, $pages2) = array_splice($stack, -2); 141 $stack[] = static::resultUnite(array($pages1, $pages2)); 142 break; 143 case 'NOT': // not operation (unary) 144 $pages = array_pop($stack); 145 $stack[] = static::resultComplement(array($pages_all, $pages)); 146 break; 147 } 148 } 149 $docs = array_pop($stack); 150 151 if (empty($docs)) return array(); 152 153 // check: settings, acls, existence 154 foreach (array_keys($docs) as $id) { 155 if (isHiddenPage($id) 156 || auth_quickaclcheck($id) < AUTH_READ 157 || !page_exists($id, '', false) 158 ) { 159 unset($docs[$id]); 160 } 161 } 162 163 $docs = static::filterResultsByTime($docs, $data['after'], $data['before']); 164 165 if ($data['sort'] === 'mtime') { 166 uksort($docs, static::class.'::pagemtimesorter'); 167 } else { 168 // sort docs by count 169 arsort($docs); 170 } 171 172 return $docs; 173 } 174 175 /** 176 * @param array $results search results in the form pageid => value 177 * @param int|string $after only returns results with mtime after this date, 178 * accepts timestap or strtotime arguments 179 * @param int|string $before only returns results with mtime after this date, 180 * accepts timestap or strtotime arguments 181 * 182 * @return array 183 */ 184 protected static function filterResultsByTime(array $results, $after, $before) 185 { 186 if ($after || $before) { 187 $after = is_int($after) ? $after : strtotime($after); 188 $before = is_int($before) ? $before : strtotime($before); 189 190 foreach ($results as $id => $value) { 191 $mTime = filemtime(wikiFN($id)); 192 if ($after && $after > $mTime) { 193 unset($results[$id]); 194 continue; 195 } 196 if ($before && $before < $mTime) { 197 unset($results[$id]); 198 } 199 } 200 } 201 return $results; 202 } 203 204 /** 205 * Sort pages by their mtime, from newest to oldest 206 * 207 * @param string $a 208 * @param string $b 209 * 210 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a 211 * and 0 if they are of the same age 212 */ 213 protected static function pagemtimesorter($a, $b) 214 { 215 $mtimeA = filemtime(wikiFN($a)); 216 $mtimeB = filemtime(wikiFN($b)); 217 return $mtimeB - $mtimeA; 218 } 219 220 /** 221 * Creates a snippet extract 222 * 223 * @author Andreas Gohr <andi@splitbrain.org> 224 * @triggers FULLTEXT_SNIPPET_CREATE 225 * 226 * @param string $id page id 227 * @param array $highlight 228 * @return mixed 229 */ 230 public static function snippet($id, $highlight) 231 { 232 $text = rawWiki($id); 233 $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens 234 $evdata = array( 235 'id' => $id, 236 'text' => &$text, 237 'highlight' => &$highlight, 238 'snippet' => '', 239 ); 240 241 $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata); 242 if ($evt->advise_before()) { 243 $match = array(); 244 $snippets = array(); 245 $utf8_offset = $offset = $end = 0; 246 $len = Utf8\PhpString::strlen($text); 247 248 // build a regexp from the phrases to highlight 249 $re1 = '(' . 250 join( 251 '|', 252 array_map( 253 static::class.'::snippetRePreprocess', 254 array_map( 255 'preg_quote_cb', 256 array_filter((array) $highlight) 257 ) 258 ) 259 ) . 260 ')'; 261 $re2 = "$re1.{0,75}(?!\\1)$re1"; 262 $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1"; 263 264 for ($cnt=4; $cnt--;) { 265 if (0) { 266 } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 267 } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 268 } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { 269 } else { 270 break; 271 } 272 273 list($str, $idx) = $match[0]; 274 275 // convert $idx (a byte offset) into a utf8 character offset 276 $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx)); 277 $utf8_len = Utf8\PhpString::strlen($str); 278 279 // establish context, 100 bytes surrounding the match string 280 // first look to see if we can go 100 either side, 281 // then drop to 50 adding any excess if the other side can't go to 50, 282 $pre = min($utf8_idx - $utf8_offset, 100); 283 $post = min($len - $utf8_idx - $utf8_len, 100); 284 285 if ($pre > 50 && $post > 50) { 286 $pre = $post = 50; 287 } elseif ($pre > 50) { 288 $pre = min($pre, 100 - $post); 289 } elseif ($post > 50) { 290 $post = min($post, 100 - $pre); 291 } elseif ($offset == 0) { 292 // both are less than 50, means the context is the whole string 293 // make it so and break out of this loop - there is no need for the 294 // complex snippet calculations 295 $snippets = array($text); 296 break; 297 } 298 299 // establish context start and end points, try to append to previous 300 // context if possible 301 $start = $utf8_idx - $pre; 302 $append = ($start < $end) ? $end : false; // still the end of the previous context snippet 303 $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context 304 305 if ($append) { 306 $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append); 307 } else { 308 $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start); 309 } 310 311 // set $offset for next match attempt 312 // continue matching after the current match 313 // if the current match is not the longest possible match starting at the current offset 314 // this prevents further matching of this snippet but for possible matches of length 315 // smaller than match length + context (at least 50 characters) this match is part of the context 316 $utf8_offset = $utf8_idx + $utf8_len; 317 $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len)); 318 $offset = Utf8\Clean::correctIdx($text, $offset); 319 } 320 321 $m = "\1"; 322 $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets); 323 $snippet = preg_replace( 324 '/' . $m . '([^' . $m . ']*?)' . $m . '/iu', 325 '<strong class="search_hit">$1</strong>', 326 hsc(join('... ', $snippets)) 327 ); 328 329 $evdata['snippet'] = $snippet; 330 } 331 $evt->advise_after(); 332 unset($evt); 333 334 return $evdata['snippet']; 335 } 336 337 /** 338 * Wraps a search term in regex boundary checks. 339 * 340 * @param string $term 341 * @return string 342 */ 343 public static function snippetRePreprocess($term) 344 { 345 // do not process asian terms where word boundaries are not explicit 346 if (Utf8\Asian::isAsianWords($term)) return $term; 347 348 if (UTF8_PROPERTYSUPPORT) { 349 // unicode word boundaries 350 // see http://stackoverflow.com/a/2449017/172068 351 $BL = '(?<!\pL)'; 352 $BR = '(?!\pL)'; 353 } else { 354 // not as correct as above, but at least won't break 355 $BL = '\b'; 356 $BR = '\b'; 357 } 358 359 if (substr($term, 0, 2) == '\\*') { 360 $term = substr($term, 2); 361 } else { 362 $term = $BL.$term; 363 } 364 365 if (substr($term, -2, 2) == '\\*') { 366 $term = substr($term, 0, -2); 367 } else { 368 $term = $term.$BR; 369 } 370 371 if ($term == $BL || $term == $BR || $term == $BL.$BR) { 372 $term = ''; 373 } 374 return $term; 375 } 376 377 /** 378 * Combine found documents and sum up their scores 379 * 380 * This function is used to combine searched words with a logical 381 * AND. Only documents available in all arrays are returned. 382 * 383 * based upon PEAR's PHP_Compat function for array_intersect_key() 384 * 385 * @param array $args An array of page arrays 386 * @return array 387 */ 388 protected static function resultCombine($args) 389 { 390 $array_count = count($args); 391 if ($array_count == 1) { 392 return $args[0]; 393 } 394 395 $result = array(); 396 if ($array_count > 1) { 397 foreach ($args[0] as $key => $value) { 398 $result[$key] = $value; 399 for ($i = 1; $i !== $array_count; $i++) { 400 if (!isset($args[$i][$key])) { 401 unset($result[$key]); 402 break; 403 } 404 $result[$key] += $args[$i][$key]; 405 } 406 } 407 } 408 return $result; 409 } 410 411 /** 412 * Unites found documents and sum up their scores 413 * based upon resultCombine() method 414 * 415 * @param array $args An array of page arrays 416 * @return array 417 * 418 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 419 */ 420 protected static function resultUnite($args) 421 { 422 $array_count = count($args); 423 if ($array_count === 1) { 424 return $args[0]; 425 } 426 427 $result = $args[0]; 428 for ($i = 1; $i !== $array_count; $i++) { 429 foreach (array_keys($args[$i]) as $id) { 430 $result[$id] += $args[$i][$id]; 431 } 432 } 433 return $result; 434 } 435 436 /** 437 * Computes the difference of documents using page id for comparison 438 * nearly identical to PHP5's array_diff_key() 439 * 440 * @param array $args An array of page arrays 441 * @return array 442 * 443 * @author Kazutaka Miyasaka <kazmiya@gmail.com> 444 */ 445 protected static function resultComplement($args) 446 { 447 $array_count = count($args); 448 if ($array_count === 1) { 449 return $args[0]; 450 } 451 452 $result = $args[0]; 453 foreach (array_keys($result) as $id) { 454 for ($i = 1; $i !== $array_count; $i++) { 455 if (isset($args[$i][$id])) unset($result[$id]); 456 } 457 } 458 return $result; 459 } 460} 461