<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) die('meh.');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query
 *
 * refactored into ft_pageSearch(), _ft_pageSearch() and trigger_event()
 *
 * @param string $query      the search query as entered by the user
 * @param array  &$highlight is overwritten with the terms to highlight
 * @return mixed the result of the SEARCH_QUERY_FULLPAGE event; the default
 *               action is _ft_pageSearch()
 */
function ft_pageSearch($query,&$highlight){
    $data = array();
    $data['query']     = $query;
    $data['highlight'] =& $highlight;

    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}

/**
 * Default action of the SEARCH_QUERY_FULLPAGE event
 *
 * Looks up all query words in the index, combines them with a logical AND,
 * then removes excluded words, hidden pages, unwanted namespaces, pages not
 * containing the exact phrases and pages the user may not read.
 *
 * @param array &$data array with the keys 'query' and 'highlight' (a reference)
 * @return array page id => combined index value, highest value first;
 *               an empty array when nothing matches
 */
function _ft_pageSearch(&$data){
    // split out original parameters
    $query     = $data['query'];
    $highlight =& $data['highlight'];

    $q = ft_queryParser($query);

    $highlight = array();

    // remember for hilighting later
    foreach($q['words'] as $wrd){
        $highlight[] = str_replace('*','',$wrd);
    }

    // without at least one positive word there is nothing to match against;
    // this also covers queries consisting of excluded words only
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);
    if(!count($result)) return array();

    // merge search results with query
    foreach($q['and'] as $pos => $w){
        $q['and'][$pos] = isset($result[$w]) ? $result[$w] : array();
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and-words (a single word list is returned unchanged)
    $docs = ft_resultCombine($q['and']);
    if(!count($docs)) return array();

    // add the hidden pages in the result to the unwanted docs
    $hidden = array_filter(array_keys($docs),'isHiddenPage');
    $not    = array_merge($not,$hidden);

    // filter unmatched namespaces
    if(!empty($q['ns'])) {
        // namespaces come straight from the query, quote them for the regexp
        $quoted = array();
        foreach($q['ns'] as $ns){
            $quoted[] = preg_quote($ns,'/');
        }
        $pattern = '/^(?:'.implode('|',$quoted).')/';
        foreach(array_keys($docs) as $key) {
            if(!preg_match($pattern,$key)) {
                unset($docs[$key]);
            }
        }
    }

    // filter unwanted namespaces
    if(!empty($q['notns'])) {
        $quoted = array();
        foreach($q['notns'] as $ns){
            $quoted[] = preg_quote($ns,'/');
        }
        $pattern = '/^(?:'.implode('|',$quoted).')/';
        foreach(array_keys($docs) as $key) {
            if(preg_match($pattern,$key)) {
                unset($docs[$key]);
            }
        }
    }

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
        // use this for higlighting later:
        $highlight = array_merge($highlight,$q['phrases']);
        $q['phrases'] = array_map('preg_quote_cb',$q['phrases']);
        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            foreach($q['phrases'] as $phrase){
                if(!preg_match('/'.$phrase.'/usi',$text)){
                    unset($docs[$id]); // no hit - remove
                    break;
                }
            }
        }
    }

    if(!count($docs)) return array();

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup with the fulltext index, then
 * checks the 'relation references' metadata of the found pages
 *
 * @param string $id the page to find backlinks for
 * @return array sorted list of readable, visible pages referencing $id
 */
function ft_backlinks($id){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
    if(!count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check metadata for matching links
    foreach($docs as $match){
        // metadata relation reference links are already resolved
        $links = p_get_metadata($match,'relation references');
        if (isset($links[$id])) $result[] = $match;
    }

    if(!count($result)) return $result;

    // check ACL permissions
    foreach(array_keys($result) as $idx){
        if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
            unset($result[$idx]);
        }
    }

    sort($result);
    return $result;
}

/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup with the fulltext index, then
 * scans the raw wiki text of the found pages for media syntax
 *
 * Aborts after $max found results
 *
 * @param string $id  the media id to look for
 * @param int    $max maximum number of pages to return; a false value
 *                    is treated as 1
 * @return array sorted list of pages embedding the media file
 */
function ft_mediause($id,$max){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    if(!$max) $max = 1; // need to find at least one

    $result = array();

    // quick lookup of the mediafile
    $media   = noNS($id);
    $matches = idx_lookup(idx_tokenizer($media,$stopwords));
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    if(!count($docs)) return $result;

    // go through all found pages
    $found = 0;
    $pcre  = preg_quote($media,'/');
    foreach($docs as $doc){
        $ns = getNS($doc);
        // first group captures the media reference inside {{...}}
        preg_match_all('/\{\{([^|}]*'.$pcre.'[^|}]*)(|[^}]+)?\}\}/i',rawWiki($doc),$hits);
        foreach($hits[1] as $img){
            $img = trim($img);
            if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images
            list($img) = explode('?',$img);                  // remove any parameters
            resolve_mediaid($ns,$img,$exists);               // resolve the possibly relative img

            if($img == $id){                                 // we have a match
                $result[] = $doc;
                $found++;
                break;
            }
        }
        if($found >= $max) break;
    }

    sort($result);
    return $result;
}



/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * refactored into ft_pageLookup(), _ft_pageLookup() and trigger_event()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       the (partial) page name to look for
 * @param bool   $pageonly match against the page name without its namespace
 * @return mixed the result of the SEARCH_QUERY_PAGELOOKUP event; the default
 *               action is _ft_pageLookup()
 */
function ft_pageLookup($id,$pageonly=true){
    $data = array('id' => $id, 'pageonly' => $pageonly);
    return trigger_event('SEARCH_QUERY_PAGELOOKUP',$data,'_ft_pageLookup');
}

/**
 * Default action of the SEARCH_QUERY_PAGELOOKUP event
 *
 * Greps the page index for the given name and drops non-existing, hidden
 * and unreadable pages.
 *
 * @param array &$data array with the keys 'id' and 'pageonly'
 * @return array list of matching page ids, sorted with ft_pagesorter()
 */
function _ft_pageLookup(&$data){
    // split out original parameters
    $id       = $data['id'];
    $pageonly = $data['pageonly'];

    global $conf;
    $id = preg_quote($id,'/');

    // no readable index means no results
    $index = $conf['indexdir'].'/page.idx';
    if(!@file_exists($index)) return array();
    $pages = file($index);
    if(!is_array($pages)) return array();

    // compare with '' explicitly, an id of "0" is falsy but still a filter
    if($id !== '') $pages = array_values(preg_grep('/'.$id.'/',$pages));

    foreach(array_keys($pages) as $i){
        // in pageonly mode the match has to be in the name, not the namespace
        if($pageonly && !preg_match('/'.$id.'/',noNS($pages[$i]))){
            unset($pages[$i]);
            continue;
        }
        if(!page_exists($pages[$i])){
            unset($pages[$i]);
        }
    }

    $pages = array_filter($pages,'isVisiblePage'); // discard hidden pages
    if(!count($pages)) return array();

    // check ACL permissions
    foreach(array_keys($pages) as $idx){
        if(auth_quickaclcheck(trim($pages[$idx])) < AUTH_READ){
            unset($pages[$idx]);
        }
    }

    // index lines still carry their line endings
    $pages = array_map('trim',$pages);
    usort($pages,'ft_pagesorter');
    return $pages;
}

/**
 * Sort pages based on their namespace level first, then on their string
 * values. This makes higher hierarchy pages rank higher than lower hierarchy
 * pages.
 *
 * @param string $a first page id
 * @param string $b second page id
 * @return int negative, zero or positive as expected by usort()
 */
function ft_pagesorter($a, $b){
    $ac = count(explode(':',$a));
    $bc = count(explode(':',$b));
    if($ac < $bc){
        return -1;
    }elseif($ac > $bc){
        return 1;
    }
    return strcmp ($a,$b);
}

/**
 * Creates a snippet extract
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id        the page to create the snippet for
 * @param array  $highlight the terms to find and mark in the page text
 * @return string HTML escaped snippet with the hits wrapped in
 *                <strong class="search_hit">
 */
function ft_snippet($id,$highlight){
    $text     = rawWiki($id);
    $match    = array();
    $snippets = array();
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);

    // build a regexp from the phrases to highlight
    $re1 = '('.join('|',array_map('preg_quote_cb',array_filter((array) $highlight))).')';
    $re2 = "$re1.{0,75}(?!\\1)$re1";
    $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

    for ($cnt=4; $cnt--;) {
        // prefer three hits close together, then two, then a single one;
        // $match holds the result of the first pattern that succeeded
        if (!preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset) &&
            !preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset) &&
            !preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) {
            break;
        }

        list($str,$idx) = $match[0];

        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text,0,$idx));
        $utf8_len = utf8_strlen($str);

        // establish context, 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre  = min($utf8_idx-$utf8_offset,100);
        $post = min($len-$utf8_idx-$utf8_len,100);

        if ($pre>50 && $post>50) {
            $pre = $post = 50;
        } else if ($pre>50) {
            $pre = min($pre,100-$post);
        } else if ($post>50) {
            $post = min($post, 100-$pre);
        } else {
            // both are less than 50, means the context is the whole string
            // make it so and break out of this loop - there is no need for the
            // complex snippet calculations
            $snippets = array($text);
            break;
        }

        // establish context start and end points, try to append to previous
        // context if possible
        $start  = $utf8_idx - $pre;
        $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
        $end    = $utf8_idx + $utf8_len + $post;   // now set it to the end of this context

        if ($append) {
            $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
        } else {
            $snippets[] = utf8_substr($text,$start,$end-$start);
        }

        // set $offset for next match attempt
        //   substract strlen to avoid splitting a potential search success,
        //   this is an approximation as the search pattern may match strings
        //   of varying length and it will fail if the context snippet
        //   boundary breaks a matching string longer than the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
        $offset = utf8_correctIdx($text,$offset);
    }

    // mark the hits with a control character first, so the markers survive
    // the HTML escaping done by hsc() and can be turned into tags afterwards
    $m = "\1";
    $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets);
    $snippet  = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));

    return $snippet;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array the keys present in all page arrays with their summed values;
 *               a single page array is returned unchanged
 */
function ft_resultCombine($args){
    $array_count = count($args);
    if($array_count == 1){
        return $args[0];
    }

    $result = array();
    if($array_count < 1){
        return $result;
    }

    foreach ($args[0] as $key => $value) {
        $result[$key] = $value;
        for ($i = 1; $i !== $array_count; $i++) {
            if (!isset($args[$i][$key])) {
                // missing in one of the arrays - not a hit
                unset($result[$key]);
                break;
            }
            $result[$key] += $args[$i][$key];
        }
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * The returned array has the keys 'query' (the original query), 'ns' and
 * 'notns' (namespaces given with @ and ^), 'phrases', 'words', 'and' and
 * 'not' (words prefixed with -).
 *
 * @todo support OR and parenthesises?
 *
 * @param string $query the search query
 * @return array the parsed query
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['ns']      = array();
    $q['notns']   = array();
    $q['phrases'] = array();
    $q['words']   = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        $q['phrases'][] = $match[1];
        $q['and'] = array_merge($q['and'], idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // consecutive spaces produce empty entries which carry no information
        if($w === '') continue;

        $first = $w[0];
        if($first == '-'){
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['not'] = array_merge($q['not'],$token);
        } else if ($first == '@') { // Namespace to search?
            $q['ns'][] = substr($w,1);
        } else if ($first == '^') { // Namespace not to search?
            $q['notns'][] = substr($w,1);
        }else{
            // asian "words" need to be searched as phrases
            if(@preg_match_all('/(('.IDX_ASIAN.')+)/u',$w,$matches)){
                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
            }
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)){
                $q['and']   = array_merge($q['and'],$token);
                $q['words'] = array_merge($q['words'],$token);
            }
        }
    }

    return $q;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :