<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',fullpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * Wrapper around preg_quote adding the default delimiter
 *
 * Has a single-argument signature so it can be used as an array_map callback.
 *
 * @param  string $string text to escape
 * @return string the text with regexp special chars and '/' escaped
 */
function ft_preg_quote_cb($string){
    return preg_quote($string,'/');
}

/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query
 *
 * @param  string $query     the search query as entered by the user
 * @param  string &$poswords receives the space separated positive search
 *                           words (wildcards removed), used for highlighting
 * @return array  matching documents (page id => score), highest score first
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);
    // use this for highlighting later:
    $poswords = str_replace('*','',join(' ',$q['and']));

    // without any positive word there is nothing to match against
    // (a query consisting only of excluded words yields no result)
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);
    if(!count($result)) return array();

    // merge search results with query
    // (a word missing from the lookup result counts as "no documents")
    foreach($q['and'] as $pos => $w){
        $q['and'][$pos] = isset($result[$w]) ? $result[$w] : array();
    }
    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(!isset($result[$w])) continue;
        $not = array_merge($not,array_keys($result[$w]));
    }

    // combine and-words (a single word list is returned unchanged)
    $docs = ft_resultCombine($q['and']);
    if(!count($docs)) return array();

    // create a list of hidden pages in the result
    $hidden = array_filter(array_keys($docs),'isHiddenPage');
    $not    = array_merge($not,$hidden);

    // filter unmatched namespaces
    if(!empty($q['ns'])) {
        // namespaces are user input, escape them before building the regexp
        $quoted  = array_map('ft_preg_quote_cb',$q['ns']);
        $pattern = '/^(?:'.implode('|',$quoted).')/';
        foreach(array_keys($docs) as $key) {
            if(!preg_match($pattern,$key)) {
                unset($docs[$key]);
            }
        }
    }

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();
    // handle phrases
    if(count($q['phrases'])){
        //build a regexp
        $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
        $q['phrases'] = array_map('ft_preg_quote_cb',$q['phrases']);
        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text  = utf8_strtolower(rawWiki($id));
            foreach($q['phrases'] as $phrase){
                if(!preg_match('/'.$phrase.'/usi',$text)){
                    unset($docs[$id]); // no hit - remove
                    break;
                }
            }
        }
    }

    if(!count($docs)) return array();

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup with the fulltext index, then
 * evaluates the instructions of the found pages
 *
 * @param  string $id the page id to find backlinks for
 * @return array  sorted list of readable, visible page ids linking to $id
 */
function ft_backlinks($id){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
    if(!count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check metadata for matching links
    foreach($docs as $match){
        // metadata relation reference links are already resolved
        $links = p_get_metadata($match,'relation references');
        if (isset($links[$id])) $result[] = $match;
    }

    if(!count($result)) return $result;

    // check ACL permissions
    foreach(array_keys($result) as $idx){
        if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
            unset($result[$idx]);
        }
    }

    sort($result);
    return $result;
}

/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id       the (partial) page name to look for
 * @param  bool   $pageonly match against the page name without namespace only
 * @return array  sorted list of matching, existing, visible and readable page ids
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;
    $id    = preg_quote($id,'/');
    $pages = file($conf['indexdir'].'/page.idx');
    // file() returns false when the page index can not be read
    if(!is_array($pages)) return array();
    // compare against '' explicitly - the id "0" is a valid search term
    if($id !== '') $pages = array_values(preg_grep('/'.$id.'/',$pages));

    $cnt = count($pages);
    for($i=0; $i<$cnt; $i++){
        if($pageonly){
            if(!preg_match('/'.$id.'/',noNS($pages[$i]))){
                unset($pages[$i]);
                continue;
            }
        }
        if(!page_exists($pages[$i])){
            unset($pages[$i]);
            continue;
        }
    }

    $pages = array_filter($pages,'isVisiblePage'); // discard hidden pages
    if(!count($pages)) return array();

    // check ACL permissions
    foreach(array_keys($pages) as $idx){
        if(auth_quickaclcheck($pages[$idx]) < AUTH_READ){
            unset($pages[$idx]);
        }
    }

    // index lines still carry their line endings
    $pages = array_map('trim',$pages);
    sort($pages);
    return $pages;
}

/**
 * Creates a snippet extract
 *
 * Looks for up to three hits of the given words in the raw wiki text of the
 * page and returns them with some surrounding context, hits being wrapped in
 * <strong class="search_hit"> tags.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id       the page to create the snippet for
 * @param  string $poswords space separated list of words to highlight
 * @return string HTML escaped snippet
 */
function ft_snippet($id,$poswords){
    $poswords = preg_quote($poswords,'#');
    $re       = '('.str_replace(' ','|',$poswords).')';
    $text     = rawWiki($id);

    $match = array();
    $snippets = array();
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);

    // collect at most three context snippets
    for ($cnt=3; $cnt--;) {
        if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;

        list($str,$idx) = $match[0];

        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text,0,$idx));
        $utf8_len = utf8_strlen($str);

        // establish context, 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre = min($utf8_idx-$utf8_offset,100);
        $post = min($len-$utf8_idx-$utf8_len,100);

        if ($pre>50 && $post>50) {
            $pre = $post = 50;
        } else if ($pre>50) {
            $pre = min($pre,100-$post);
        } else if ($post>50) {
            $post = min($post, 100-$pre);
        } else {
            // both are less than 50, means the context is the whole string
            // make it so and break out of this loop - there is no need for the
            // complex snippet calculations
            $snippets = array($text);
            break;
        }

        // establish context start and end points, try to append to previous
        // context if possible
        $start = $utf8_idx - $pre;
        $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
        $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

        if ($append) {
            $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
        } else {
            $snippets[] = utf8_substr($text,$start,$end-$start);
        }

        // set $offset for next match attempt
        //   subtract strlen to avoid splitting a potential search success,
        //   this is an approximation as the search pattern may match strings
        //   of varying length and it will fail if the context snippet
        //   boundary breaks a matching string longer than the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
        $offset = utf8_correctIdx($text,$offset);
    }

    // mark the hits with a control character first, so the markers survive
    // the HTML escaping and can be turned into tags afterwards
    $m = "\1";
    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
    $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));

    return $snippet;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param  array $args An array of page arrays (zero based list)
 * @return array documents present in all page arrays, values summed up
 */
function ft_resultCombine($args){
    $array_count = count($args);
    if($array_count == 1){
        return $args[0];
    }

    $result = array();
    if ($array_count > 1) {
        foreach ($args[0] as $key => $value) {
            $result[$key] = $value;
            for ($i = 1; $i !== $array_count; $i++) {
                if (!isset($args[$i][$key])) {
                    // missing in one of the arrays - not an AND match
                    unset($result[$key]);
                    break;
                }
                $result[$key] += $args[$i][$key];
            }
        }
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * The returned array has the following keys:
 *   query   - the unmodified query string
 *   ns      - namespaces given with @namespace
 *   phrases - phrases that have to match exactly
 *   and     - words that have to be in the document
 *   not     - words that must not be in the document
 *
 * @todo support OR and parentheses?
 *
 * @param  string $query the search query as entered by the user
 * @return array  the parsed query
 */
function ft_queryParser($query){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['ns']      = array();
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // strip namespace from query
    if(preg_match('/([^@]*)@(.*)/',$query,$match))  {
        $query = $match[1];
        $q['ns'] = explode('@',preg_replace("/ /",'',$match[2]));
    }

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        $q['phrases'][] = $match[1];
        $q['and'] = array_merge($q['and'], idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // consecutive spaces result in empty words, they carry no search term
        if($w === '') continue;

        if($w[0] === '-'){
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['not'] = array_merge($q['not'],$token);
        }else{
            // asian "words" need to be searched as phrases
            if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
                $q['phrases'] = array_merge($q['phrases'],$matches[1]);

            }
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :