<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * Wrapper around preg_quote adding the default delimiter
 *
 * Usable as an array_map() callback for patterns delimited by '/'.
 *
 * @param  string $string  the raw text to escape
 * @return string          the text with regex special chars and '/' escaped
 */
function ft_preg_quote_cb($string){
    return preg_quote($string,'/');
}

/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query. The query is
 * parsed by ft_queryParser(), the words are looked up in the index and the
 * candidates are then narrowed down by negative words, hidden pages,
 * namespace restrictions, exact phrases and ACL read permission.
 *
 * @param  string $query     the search query as entered by the user
 * @param  string $poswords  (out) space separated positive words with the
 *                           '*' wildcards removed, meant for highlighting
 * @return array             page id => score, sorted by descending score
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);
    // use this for highlighting later:
    $poswords = str_replace('*','',join(' ',$q['and']));

    // lookup all words found in the query
    $words = array_merge($q['and'],$q['not']);
    if(!count($words)) return array();
    $result = idx_lookup($words);

    // merge search results with query
    foreach($q['and'] as $pos => $w){
        $q['and'][$pos] = isset($result[$w]) ? $result[$w] : array();
    }
    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and-words; ft_resultCombine() copes with zero or one list, so
    // a query made of negative words only simply yields no documents
    $docs = ft_resultCombine(array_values($q['and']));
    if(!count($docs)) return array();

    // add the hidden pages of the result to the unwanted docs
    $hidden = array_filter(array_keys($docs),'isHiddenPage');
    $not    = array_merge($not,$hidden);

    // filter unmatched namespaces
    if(!empty($q['ns'])) {
        // namespaces come straight from the query - quote them so they
        // are matched literally and can not break the expression
        $pattern = implode('|^',array_map('ft_preg_quote_cb',$q['ns']));
        foreach(array_keys($docs) as $key) {
            if(!preg_match('/^'.$pattern.'/',$key)) {
                unset($docs[$key]);
            }
        }
    }

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp
        $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
        $q['phrases'] = array_map('ft_preg_quote_cb',$q['phrases']);
        $regex = '('.join('|',$q['phrases']).')';
        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match('/'.$regex.'/usi',$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup with the fulltext index, then
 * checks the 'relation references' metadata of the found pages
 *
 * @param  string $id  the page id to find backlinks for
 * @return array       sorted list of readable page ids linking to $id
 */
function ft_backlinks($id){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
    if(!count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check metadata for matching links
    foreach($docs as $match){
        // metadata relation reference links are already resolved
        $links = p_get_metadata($match,'relation references');
        if (isset($links[$id])) $result[] = $match;
    }

    if(!count($result)) return $result;

    // check ACL permissions
    foreach(array_keys($result) as $idx){
        if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
            unset($result[$idx]);
        }
    }

    sort($result);
    return $result;
}

/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id        the text to look for (matched literally)
 * @param  bool   $pageonly  true: match the page name only,
 *                           false: match anywhere in the full id
 * @return array             sorted list of existing, visible, readable page ids
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;
    $id = preg_quote($id,'/');

    $idxfile = $conf['indexdir'].'/page.idx';
    $pages   = @file_exists($idxfile) ? file($idxfile) : array();
    if(!is_array($pages)) return array(); // index could not be read

    // file() keeps the line endings - strip them right away so every check
    // below (name match, file lookup, hidden pages, ACL) sees the plain id
    $pages = array_map('trim',$pages);

    // compare against '' explicitly, a search for "0" is a valid search
    if($id !== '') $pages = array_values(preg_grep('/'.$id.'/',$pages));

    foreach(array_keys($pages) as $idx){
        // the grep above matched the full id, recheck against the name only
        if($pageonly && !preg_match('/'.$id.'/',noNS($pages[$idx]))){
            unset($pages[$idx]);
            continue;
        }
        // drop index entries whose page file is gone
        if(!@file_exists(wikiFN($pages[$idx]))){
            unset($pages[$idx]);
        }
    }

    $pages = array_filter($pages,'isVisiblePage'); // discard hidden pages
    if(!count($pages)) return array();

    // check ACL permissions
    foreach(array_keys($pages) as $idx){
        if(auth_quickaclcheck($pages[$idx]) < AUTH_READ){
            unset($pages[$idx]);
        }
    }

    sort($pages);
    return $pages;
}

/**
 * Creates a snippet extract
 *
 * Looks for up to three hits of the given words in the raw wiki text of
 * the page and returns them with some surrounding context. The result is
 * escaped with hsc() and each hit is wrapped in a search_hit span.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id        the page to create the snippet for
 * @param  string $poswords  space separated words to highlight
 * @return string            the snippet as HTML, contexts joined by '... '
 */
function ft_snippet($id,$poswords){
    $poswords = preg_quote($poswords,'#');
    $re       = '('.str_replace(' ','|',$poswords).')';
    $text     = rawWiki($id);

    $match = array();
    $snippets = array();
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);

    // three rounds at most, each one adds or extends a context snippet
    for ($cnt=3; $cnt--;) {
        if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;

        list($str,$idx) = $match[0];

        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text,0,$idx));
        $utf8_len = utf8_strlen($str);

        // establish context, 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre  = min($utf8_idx-$utf8_offset,100);
        $post = min($len-$utf8_idx-$utf8_len,100);

        if ($pre>50 && $post>50) {
            $pre = $post = 50;
        } else if ($pre>50) {
            $pre = min($pre,100-$post);
        } else if ($post>50) {
            $post = min($post, 100-$pre);
        } else {
            // both are less than 50, means the context is the whole string
            // make it so and break out of this loop - there is no need for the
            // complex snippet calculations
            $snippets = array($text);
            break;
        }

        // establish context start and end points, try to append to previous
        // context if possible
        $start  = $utf8_idx - $pre;
        $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
        $end    = $utf8_idx + $utf8_len + $post;   // now set it to the end of this context

        if ($append) {
            $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
        } else {
            $snippets[] = utf8_substr($text,$start,$end-$start);
        }

        // set $offset for next match attempt
        //   subtract strlen to avoid splitting a potential search success,
        //   this is an approximation as the search pattern may match strings
        //   of varying length and it will fail if the context snippet
        //   boundary breaks a matching string longer than the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
        $offset = utf8_correctIdx($text,$offset);
    }

    // mark the hits with a control character first, so the span markup can
    // be inserted after the text went through hsc()
    $m = "\1";
    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
    $snippet  = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));

    return $snippet;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned, the
 * score of a document is the sum of its scores in all arrays.
 *
 * A single array is returned unchanged, an empty list of arrays
 * gives an empty result.
 *
 * @param  array $args  An array of page arrays (page id => score)
 * @return array        page id => summed score, in the order of the first array
 */
function ft_resultCombine($args){
    $args        = array_values($args);
    $array_count = count($args);
    if($array_count == 0) return array();
    if($array_count == 1) return $args[0];
    if(!is_array($args[0])) return array();

    $result = array();
    foreach ($args[0] as $key => $value) {
        $sum = $value;
        for ($i = 1; $i < $array_count; $i++) {
            // missing in one of the arrays - not part of the intersection
            if (!is_array($args[$i]) || !array_key_exists($key,$args[$i])) continue 2;
            $sum += $args[$i][$key];
        }
        $result[$key] = $sum;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * The returned array has these keys:
 *   query   - the query as passed in
 *   ns      - namespaces given with @namespace
 *   phrases - exact phrases (quoted text and asian character runs)
 *   and     - words that have to be found
 *   not     - words that must not be found (prefixed with -)
 *
 * @todo support OR and parentheses?
 *
 * @param  string $query  the search query as entered by the user
 * @return array          the parsed query, see above
 */
function ft_queryParser($query){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    $q = array();
    $q['query']   = $query;
    $q['ns']      = array();
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // strip namespace from query
    if(preg_match('/([^@]*)@(.*)/',$query,$match))  {
        $query = $match[1];
        $q['ns'] = explode('@',preg_replace("/ /",'',$match[2]));
    }

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        $q['phrases'][] = $match[1];
        // add to the words collected so far, do not replace them - the
        // words of every phrase have to be looked up in the index
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // substr() is safe on the empty strings produced by double spaces
        if(substr($w,0,1) === '-'){
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['not'] = array_merge($q['not'],$token);
        }else{
            // asian "words" need to be searched as phrases
            if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
            }
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :