<?php
/**
 * Common DokuWiki functions
 *
 * Functions for building and querying the full-text search index.
 *
 * Files used below the configured index directory ($conf['indexdir']):
 *   page.idx  - one page name per line, the line number is the page id
 *   w<N>.idx  - one word of byte length N per line, the line number is the word id
 *   i<N>.idx  - line <word id> holds "pid*count:pid*count..." for that word
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_CONF.'dokuwiki.php');
require_once(DOKU_INC.'inc/io.php');
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN','['.
                   '\x{0E00}-\x{0E7F}'.  // Thai
                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');


/**
 * Write a list of strings to an index file.
 *
 * The data is written to "<pre><wlen>.tmp" first and then moved over
 * "<pre><wlen>.idx" with io_rename().
 *
 * @param string     $pre  file name prefix ('page', 'w', 'i', ...)
 * @param int|string $wlen word length part of the file name ('' for none)
 * @param array      $idx  the lines to write, each already newline terminated
 * @return bool false if the temp file could not be opened or written
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_saveIndex($pre, $wlen, $idx){
    global $conf;
    $fn = $conf['indexdir'].'/'.$pre.$wlen;
    $fh = @fopen($fn.'.tmp','w');
    if(!$fh) return false;

    $data    = join('',$idx);
    $written = fwrite($fh,$data);
    fclose($fh);

    // never move a truncated temp file over a good index
    if($written === false || $written < strlen($data)){
        @unlink($fn.'.tmp');
        return false;
    }

    if(!empty($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']);
    io_rename($fn.'.tmp', $fn.'.idx');
    return true;
}

/**
 * Read the list of words in an index (if it exists).
 *
 * @param string     $pre  file name prefix ('page', 'w', 'i', ...)
 * @param int|string $wlen word length part of the file name ('' for none)
 * @return array lines of the index file including their trailing newline,
 *               an empty array when the file does not exist
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($pre, $wlen){
    global $conf;
    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    if(!@file_exists($fn)) return array();
    return file($fn);
}

/**
 * Create an empty index file if it doesn't exist yet.
 *
 * @param string     $pre  file name prefix ('page', 'w', 'i', ...)
 * @param int|string $wlen word length part of the file name ('' for none)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_touchIndex($pre, $wlen){
    global $conf;
    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    if(!@file_exists($fn)){
        touch($fn);
        if(!empty($conf['fperm'])) chmod($fn, $conf['fperm']);
    }
}

/**
 * Split a page into words
 *
 * Returns an array of word counts, false if an error occurred.
 * Array is keyed on the word length, then the word index.
 *
 * Words not yet known are appended to the matching w<len>.idx file, so
 * this function writes to the index directory as a side effect.
 *
 * @param string $page the page id to read and tokenize
 * @return array|false array(wordlen => array(word id => frequency))
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $body = rawWiki($page);
    // strtr() in from/to form ignores the surplus characters of the longer
    // argument, so the replacement needs one space per translated character
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there maybe a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        // the metadata may be missing for a page without (rendered) links
        if (is_array($links) && count($links)) {
            $tmp = join(' ',array_keys($links));              // make a single string
            $tmp = strtr($tmp, ':', ' ');                     // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));  // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        $arr = idx_tokenizer($word,$stopwords);
        $arr = array_count_values($arr);
        foreach ($arr as $w => $c) {
            $l = strlen($w);
            if(!isset($words[$l])) $words[$l] = array();

            // accumulate: the same word can be produced by several raw tokens.
            // (the previous one-line form "a + (isset(x)) ? x : 0" was parsed as
            // "(a + isset(x)) ? x : 0" and therefore lost the new count)
            $sofar = isset($words[$l][$w]) ? $words[$l][$w] : 0;
            $words[$l][$w] = $sofar + $c * $count;
        }
    }

    // arrive here with $words = array(wordlen => array(word => frequency))

    $index = array(); //resulting index
    foreach (array_keys($words) as $wlen){
        $word_idx = idx_getIndex('w',$wlen);
        foreach ($words[$wlen] as $word => $freq) {
            // strict search: both sides are strings and numeric looking words
            // like "0123" and "123" must not be treated as equal
            $wid = array_search("$word\n",$word_idx,true);
            if(!is_int($wid)){
                $word_idx[] = "$word\n";
                $wid = count($word_idx)-1;
            }
            if(!isset($index[$wlen]))
                $index[$wlen] = array();
            $index[$wlen][$wid] = $freq;
        }

        // save back word index
        if(!idx_saveIndex('w',$wlen,$word_idx)){
            trigger_error("Failed to write word index", E_USER_ERROR);
            return false;
        }
    }

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * Only the i<len>.idx files for word lengths that occur in the current
 * page text are rewritten.
 *
 * @param string $page the page id to index
 * @return bool false on error, true otherwise
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        if (!idx_saveIndex('page','',$page_idx))
            return false;
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    foreach(array_keys($words) as $wlen){
        // Open index and temp file
        $fn = $conf['indexdir']."/i$wlen";
        idx_touchIndex('i',$wlen);
        $idx = fopen($fn.'.idx','r');
        $tmp = fopen($fn.'.tmp','w');
        if(!$idx || !$tmp){
            // don't leak the handle that did open
            if($idx) fclose($idx);
            if($tmp) fclose($tmp);
            trigger_error("Failed to open index files", E_USER_ERROR);
            return false;
        }

        // copy from index to temp file, modifying where needed
        $lno  = 0;
        $line = '';
        while (!feof($idx)) {
            // read full line
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;

            // write a new Line to temp file (count 0 when the page
            // does not contain the word with id $lno)
            $cnt = isset($words[$wlen][$lno]) ? $words[$wlen][$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$cnt);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // a last line without trailing newline is still a valid entry
        if($line !== ''){
            $cnt = isset($words[$wlen][$lno]) ? $words[$wlen][$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$cnt);
            $lno++;
        }
        fclose($idx);

        // add missing lines (usually index and word should contain
        // the same number of lines, however if the page contained
        // new words the word file has some more lines which need to
        // be added here
        $word_idx = idx_getIndex('w',$wlen);
        $wcnt = count($word_idx);
        for(; $lno<$wcnt; $lno++){
            $cnt = isset($words[$wlen][$lno]) ? $words[$wlen][$lno] : 0;
            idx_writeIndexLine($tmp,'',$pid,$cnt);
        }

        // close the temp file and move it over to be the new one
        fclose($tmp);
        if(!empty($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']);
        // try rename first (fast) fallback to copy (slow)
        io_rename($fn.'.tmp', $fn.'.idx');
    }

    return true;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes an line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and readds it when $count is >0.
 *
 * @param resource $fh    file handle opened for writing
 * @param string   $line  existing index line ("pid*count:pid*count..."), may be empty
 * @param int      $pid   id of the page being (re)indexed
 * @param int      $count number of occurrences in that page, 0 to drop the page
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            // pad so that a part without '*' does not raise a notice
            list($doc,$cnt) = array_pad(explode('*',$part),2,'');
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @param int $minlen the smallest word length of interest
 * @return array $minlen itself if i<minlen>.idx exists, followed by every
 *               larger length found in the directory (in directory order)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_indexLengths($minlen){
    global $conf;
    $dir = @opendir($conf['indexdir']);
    if($dir===false)
        return array();
    $idx = array();
    // Exact match first.
    if(@file_exists($conf['indexdir']."/i$minlen.idx"))
        $idx[] = $minlen;
    while (($f = readdir($dir)) !== false) {
        if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
            $i = substr($f,1,-4);
            if (is_numeric($i) && $i > $minlen)
                $idx[] = $i;
        }
    }
    closedir($dir);
    return $idx;
}

/**
 * Lookup words in index
 *
 * Takes an array of word and will return a list of matching
 * documents for each one.
 *
 * A word may start and/or end with '*' to request a suffix, prefix or
 * substring match. Words shorter than 3 bytes are skipped unless they
 * are numeric or contain a wildcard.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array $words the words to look up
 * @return array|false array(word => array(page => hit count)),
 *                     false if an index file could not be opened
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;
        $xword = $word;
        $wlen  = strlen($word);

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            $wlen -= 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
            $wlen -= 1;
        }
        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;

        // look for the ID(s) for the given word
        if($wild){            // handle wildcard search
            // suffix pattern, only needed for a leading wildcard; the '$'
            // also matches in front of the newline that ends each index word
            $ptn = ($wild == 1) ? '/'.preg_quote($xword,'/').'$/' : '';

            foreach (idx_indexLengths($wlen) as $ixlen){
                $word_idx = idx_getIndex('w',$ixlen);
                $cnt = count($word_idx);
                for($wid=0; $wid<$cnt; $wid++){
                    $iword = $word_idx[$wid];
                    if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                        (($wild==1) && preg_match($ptn,$iword)) ||
                        (($wild==2) && (0 === strpos($iword,$xword)))
                      ){
                        if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
                        $wids[$ixlen][] = $wid;
                        $result[$word][] = "$ixlen*$wid";
                    }
                }
            }
        }else{                // handle exact search
            $word_idx = idx_getIndex('w',$wlen);
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                // append: other words of the same length may already
                // have registered their ids for this index file
                if(!isset($wids[$wlen])) $wids[$wlen] = array();
                $wids[$wlen][] = $wid;
                $result[$word][] = "$wlen*$wid";
            }
        }
    }

    $docs = array();                          // hold docs found
    foreach(array_keys($wids) as $wlen){
        // ascending, duplicate free list of the lines we need from i<wlen>.idx
        sort($wids[$wlen]);
        $ixids = array_values(array_unique($wids[$wlen]));

        // Open index
        idx_touchIndex('i',$wlen);
        $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
        if(!$idx){
            msg("Failed to open index file",-1);
            return false;
        }

        // Walk the index til the lines are found
        $lno  = 0;
        $line = '';
        $srch = array_shift($ixids);               // which word do we look for?
        while (!feof($idx)) {
            // read full line
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;
            if($lno > $srch) break;   // shouldn't happen

            // do we want this line?
            if($lno == $srch){
                // add docs to list
                $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);

                $srch = array_shift($ixids);        // next word to look up
                if(is_null($srch)) break;           // no more words
            }

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        fclose($idx);
    }


    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // the index file may be shorter than the word file
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}

/**
 * Returns a list of documents and counts from a index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array  array(page name => count)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        // pad so that a part without '*' does not raise a notice
        list($doc,$cnt) = array_pad(explode('*',$part),2,'');
        if(!$cnt) continue;
        // unknown page id
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against '' so that a page called "0" is not dropped
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * Non-numeric words shorter than 3 bytes and stopwords are dropped,
 * everything else is lowercased.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords (each with trailing newline)
 * @param boolean  $wc         are wildcards allowed?
 * @return array   list of words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();
    // when wildcards are not allowed, '*' is passed on to
    // utf8_stripspecials() together with the other extra characters
    $wc = ($wc) ? '' : '\*';

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($stopwords && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        // plain ASCII letters and digits only: the string is a single word
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($stopwords && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :