<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * Wrapper around preg_quote adding the default delimiter
 *
 * Has a single parameter so it can be used as an array_map() callback.
 *
 * @param  string $string text to be used literally inside a '/'-delimited regexp
 * @return string the quoted text
 */
function ft_preg_quote_cb($string){
    return preg_quote($string,'/');
}

/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query. The query is
 * split up by ft_queryParser(), the words are looked up in the index and
 * the hits are then narrowed down by negative words, hidden pages,
 * namespace filters, exact phrases and ACLs.
 *
 * @param  string $query     the search query as entered by the user
 * @param  string &$poswords is set to the space separated positive search
 *                           words (wildcards removed) for later highlighting
 * @return array  matching documents as used by the index lookup (page id
 *                as key, summed up score as value), best score first
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);
    // use this for highlighting later:
    $poswords = str_replace('*','',join(' ',$q['and']));

    // without at least one positive word there is nothing to start from
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);
    if(!count($result)) return array();

    // merge search results with query
    // NOTE(review): idx_lookup() is assumed to return one entry per word;
    // a missing entry is treated like a word without any hits
    $and = array();
    foreach($q['and'] as $w){
        $and[] = isset($result[$w]) ? $result[$w] : array();
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w])) $not = array_merge($not,array_keys($result[$w]));
    }

    // combine and-words
    $docs = ft_resultCombine($and);
    if(!count($docs)) return array();

    // hidden pages in the result are unwanted, too
    $hidden = array_filter(array_keys($docs),'isHiddenPage');
    $not    = array_merge($not,$hidden);

    // filter unmatched namespaces
    if(!empty($q['ns'])) {
        // the namespaces are user input - quote them before building the regexp
        $pattern = '/^(?:'.join('|',array_map('ft_preg_quote_cb',$q['ns'])).')/';
        foreach(array_keys($docs) as $key) {
            if(!preg_match($pattern,$key)) {
                unset($docs[$key]);
            }
        }
    }

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp
        $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
        $q['phrases'] = array_map('ft_preg_quote_cb',$q['phrases']);
        $regex = '/('.join('|',$q['phrases']).')/usi';
        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match($regex,$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup with the fulltext index, then
 * evaluates the metadata of the found pages
 *
 * @param  string $id the page to find backlinks for
 * @return array  sorted list of readable, visible pages referencing $id
 */
function ft_backlinks($id){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
    if(!count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    foreach($docs as $match){
        // check metadata for matching links
        // metadata relation reference links are already resolved
        $links = p_get_metadata($match,'relation references');
        if(!isset($links[$id])) continue;

        // check ACL permissions
        if(auth_quickaclcheck($match) < AUTH_READ) continue;

        $result[] = $match;
    }

    sort($result);
    return $result;
}

/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id       text to look for in the page ids (matched literally)
 * @param  bool   $pageonly when true the text has to be found in the
 *                          pagename itself, not just in the namespace part
 * @return array  sorted list of existing, visible and readable page ids
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;
    $id    = preg_quote($id,'/');
    $pages = file($conf['indexdir'].'/page.idx');
    if($pages === false) return array(); // index could not be read

    // file() keeps the line endings - strip them before the ids are used
    $pages = array_map('trim',$pages);
    if($id !== '') $pages = preg_grep('/'.$id.'/',$pages);

    $found = array();
    foreach($pages as $page){
        // the match has to be in the pagename itself
        if($pageonly && !preg_match('/'.$id.'/',noNS($page))) continue;
        // the index may still list pages that were deleted
        if(!@file_exists(wikiFN($page))) continue;
        // discard hidden pages
        if(!isVisiblePage($page)) continue;
        // check ACL permissions
        if(auth_quickaclcheck($page) < AUTH_READ) continue;

        $found[] = $page;
    }

    sort($found);
    return $found;
}

/**
 * Creates a snippet extract
 *
 * Searches the raw wiki text of the page for up to three occurrences of
 * the given words and returns them with some surrounding context. The
 * result is HTML escaped, the found words are wrapped in
 * <span class="search_hit"> elements and the parts are joined by '... '.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id       the page to create the snippet for
 * @param  string $poswords space separated words to look for
 * @return string the snippet as HTML
 */
function ft_snippet($id,$poswords){
    $poswords = preg_quote($poswords,'#');
    $re       = '('.str_replace(' ','|',$poswords).')';
    $text     = rawWiki($id);

    $match = array();
    $snippets = array();
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);

    // the counter limits the snippet to at most three matches
    for ($cnt=3; $cnt--;) {
        if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;

        list($str,$idx) = $match[0];

        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text,0,$idx));
        $utf8_len = utf8_strlen($str);

        // establish context, 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre = min($utf8_idx-$utf8_offset,100);
        $post = min($len-$utf8_idx-$utf8_len,100);

        if ($pre>50 && $post>50) {
            $pre = $post = 50;
        } else if ($pre>50) {
            $pre = min($pre,100-$post);
        } else if ($post>50) {
            $post = min($post, 100-$pre);
        } else {
            // both are less than 50, means the context is the whole string
            // make it so and break out of this loop - there is no need for the
            // complex snippet calculations
            $snippets = array($text);
            break;
        }

        // establish context start and end points, try to append to previous
        // context if possible
        $start = $utf8_idx - $pre;
        $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
        $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

        if ($append) {
            $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
        } else {
            $snippets[] = utf8_substr($text,$start,$end-$start);
        }

        // set $offset for next match attempt
        //   subtract strlen to avoid splitting a potential search success,
        //   this is an approximation as the search pattern may match strings
        //   of varying length and it will fail if the context snippet
        //   boundary breaks a matching string longer than the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
        $offset = utf8_correctIdx($text,$offset);
    }

    // mark the hits with a control character first, so the markers survive
    // the HTML escaping and can be turned into the highlighting spans afterwards
    $m = "\1";
    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
    $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));

    return $snippet;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param  array $args An array of page arrays (key => score)
 * @return array the entries whose key is present in every given array,
 *               in the order of the first array, with the scores of all
 *               arrays added up. A single array is returned unchanged,
 *               no arrays at all give an empty result.
 */
function ft_resultCombine($args){
    $args        = array_values($args); // make sure the lists are indexed from 0
    $array_count = count($args);
    if($array_count == 0) return array();
    if($array_count == 1) return $args[0];

    $result = array();
    foreach ($args[0] as $key => $value) {
        $score = $value;
        for ($i = 1; $i < $array_count; $i++) {
            // missing in one of the lists - this is no AND match
            if(!isset($args[$i][$key])) continue 2;
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * The returned array has these keys:
 *
 *   query   - the query as it was passed in
 *   ns      - namespaces given with @namespace
 *   phrases - exact phrases ("quoted" text and runs of asian characters)
 *   and     - tokenized words which have to be found
 *   not     - tokenized words (given with a leading -) which must not be found
 *
 * @todo support OR and parenthesises?
 *
 * @param  string $query the search query as entered by the user
 * @return array  the parsed query
 */
function ft_queryParser($query){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['ns']      = array();
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // strip namespace from query
    if(preg_match('/([^@]*)@(.*)/',$query,$match))  {
        $query = $match[1];
        $q['ns'] = explode('@',preg_replace("/ /",'',$match[2]));
    }

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        $q['phrases'][] = $match[1];
        // add to the words found so far - do not replace them
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // repeated spaces result in empty words
        if($w === '') continue;

        if($w[0] == '-'){
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['not'] = array_merge($q['not'],$token);
        }else{
            // asian "words" need to be searched as phrases
            if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
            }
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :