<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

if(!defined('DOKU_INC')) die('meh.');

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 3);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * @return int|string INDEXER_VERSION, suffixed with '+' and the tokenizer
 *                    command when the external tokenizer is enabled
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_version(){
    global $conf;
    if($conf['external_tokenizer'])
        return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']);
    else
        return INDEXER_VERSION;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length plus, for every byte in the range
 * 0xE2-0xEF, the value (byte - 0xE1).
 *
 * @param string $w the word to measure
 * @return int the (faked) length used to pick the index file
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
        foreach($leadbytes[0] as $b)
            $l += ord($b) - 0xE1;
    }
    return $l;
}

/**
 * Class that encapsulates operations on the indexer database.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Doku_Indexer {

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return boolean|string   true on success, false on failure, or the
     *                          string "locked" when the index lock could not
     *                          be acquired (compare with ===)
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        if (!$this->_lock())
            return "locked";

        // load known documents; the position in the page index is the page id
        $pid = $this->_addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->_unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->_getPageWords($text);
        if ($words === false) {
            $this->_unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->_getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    $idx = ($wid<count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->_updateTuple($idx, $pid, $freq);
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->_saveIndex('i', $wlen, $index)) {
                    $this->_unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->_getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->_getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // the word may be missing if the index file was truncated
                    if (!isset($index[$wid])) continue;
                    $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0);
                }
                if (!$this->_saveIndex('i', $wlen, $index)) {
                    $this->_unlock();
                    return false;
                }
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->_unlock();
            return false;
        }

        $this->_unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * New words are appended to the 'w' index of their length.
     *
     * @param string $text the body of the page
     * @return array|false array(wordlen => array(word id => frequency)),
     *                     or false when a word index could not be saved
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getPageWords($text) {
        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w=>$c) {
            $l = wordlen($w);
            if (isset($words[$l])){
                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            }else{
                $words[$l] = array($w => $c);
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->_getIndex('w', $wlen);
            // track modification per length so untouched files are not rewritten
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                // numeric tokens become integer array keys; compare as strings
                // so that e.g. "1e3" and "1000" are not treated as equal
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add keys to the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string   true on success, false on failure, or the
     *                          string "locked" when the index lock could not
     *                          be acquired
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->_lock())
            return "locked";

        // load known documents
        $pid = $this->_addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->_unlock();
            return false;
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $metaidx = $this->_getIndex($metaname, '_i');
            $metawords = $this->_getIndex($metaname, '_w');
            $addwords = false;
            $update = array();
            if (!is_array($values)) $values = array($values);
            foreach ($values as $val) {
                $val = (string)$val;
                if ($val !== "") {
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $addwords = true;
                    }
                    // a freshly added value has no line in the _i index yet
                    $old = isset($metaidx[$id]) ? $metaidx[$id] : '';
                    $metaidx[$id] = $this->_updateTuple($old, $pid, 1);
                    $update[$id] = 1;
                } else {
                    $id = array_search($val, $metawords, true);
                    if ($id !== false) {
                        $old = isset($metaidx[$id]) ? $metaidx[$id] : '';
                        $metaidx[$id] = $this->_updateTuple($old, $pid, 0);
                        $update[$id] = 0;
                    }
                }
            }
            if (!empty($update)) {
                if ($addwords)
                    $this->_saveIndex($metaname.'_w', '', $metawords);
                $this->_saveIndex($metaname.'_i', '', $metaidx);
                $val_line = $this->_getIndexKey($metaname, '_p', $pid);
                // an empty line must yield an empty set, not a set holding ''
                $val_idx = ($val_line === '') ? array() : array_flip(explode(':', $val_line));
                foreach ($update as $id => $add) {
                    if ($add) $val_idx[$id] = 1;
                    else unset($val_idx[$id]);
                }
                $val_idx = array_keys($val_idx);
                $this->_saveIndexKey($metaname.'_p', '', $pid, implode(':', $val_idx));
            }
            unset($metaidx);
            unset($metawords);
        }

        $this->_unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
     *
     * NOTE: not implemented yet; the method does nothing and returns null.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page) {
    }

    /**
     * Split the text into words for fulltext search
     *
     * TODO: does this also need &$stopwords ?
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false) {
        global $conf;
        $words = array();
        // when wildcards are not allowed, '*' is added to the characters to strip
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') {
            if (0 == io_exec($conf['tokenizer_cmd'], $text, $output))
                $text = $output;
        } else {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }
        $text = strtr($text, "\r\n\t", '   ');
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $word) {
            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                utf8_strtolower($word) : strtolower($word);
            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
            // strict comparison: numeric-looking words must match literally
            if (in_array($word, $stopwords, true)) continue;
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * @param arrayref  $tokens list of words to search for
     * @return array            list of page names with usage counts
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens) {
        $result = array();
        $wids = $this->_getIndexWords($tokens, $result);
        if (empty($wids)) return array();
        // load known words and documents
        $page_idx = $this->_getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->_getIndex('i', $wlen);
            foreach($wids[$wlen] as $ixid) {
                if ($ixid < count($index))
                    $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]);
            }
        }
        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // the word may be known but have no line in the 'i' index
                if (!isset($docs[$wid])) continue;
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (!isset($final[$word][$hitkey]))
                        $final[$word][$hitkey] = $hitcnt;
                    else
                        $final[$word][$hitkey] += $hitcnt;
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function
     *
     * NOTE: not implemented yet; always returns an empty array.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for
     * @param callback  $func   comparison function
     * @return array            list with page names, keys are query values if more than one given
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey($key, $value, $func=null) {
        return array();
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param arrayref  $words  The query terms.
     * @param arrayref  $result Set to word => array("length*id" ...)
     * @return array            Set to length => array(id ...)
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndexWords(&$words, &$result) {
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            $caret = false;
            $dollar = false;
            $xword = (string)$word;
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = true;
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = true;
                $wlen -= 1;
            }
            if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword))
                continue;
            if (!isset($tokens[$xword]))
                $tokenlength[$wlen][] = $xword;
            if ($caret || $dollar) {
                $re = preg_quote($xword, '/');
                if ($caret) $re = '^'.$re;
                if ($dollar) $re = $re.'$';
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword]))
                    $tokenwild[$xword] = $wlen;
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->_indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->_getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    // strict: "1e3" must not match an indexed "1000"
                    $wid = array_search((string)$xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // $tokenwild is sorted by length, so later entries are longer still
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    if (is_null($w[1])) continue;
                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     *
     * NOTE: filtering by metadata key is not implemented yet; when $key is
     * given the method returns null.
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null) {
        $page_idx = $this->_getIndex('page', '');
        if (is_null($key)) return $page_idx;
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * NOTE: not implemented yet; the method does nothing and returns null.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $key=null) {
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory in the lock dir. A lock older than five
     * minutes is considered stale and removed.
     *
     * @return boolean|string   true when the lock was acquired, the string
     *                          "stale lock removed" when a stale lock had to
     *                          be removed first, false when locking failed
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _lock() {
        global $conf;
        $status = true;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            // only an existing directory can be a stale lock; without this
            // check an uncreatable lock would loop forever
            if (is_dir($lock) && time() - @filemtime($lock) > 60*5) {
                // looks like a stale lock, remove it
                if (!@rmdir($lock))
                    return false;
                $status = "stale lock removed";
            } else {
                return false;
            }
        }
        if ($conf['dperm'])
            chmod($lock, $conf['dperm']);
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * @return boolean always true
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _unlock() {
        global $conf;
        @rmdir($conf['lockdir'].'/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without line endings; empty
     *                          when the index file does not exist
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return array();
        return file($fn, FILE_IGNORE_NEW_LINES);
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The data is written to a temporary file which then replaces the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param arrayref  $lines  list of lines without line endings
     * @return boolean          false when the temporary file could not be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        fwrite($fh, join("\n", $lines));
        // terminate the last line so trailing empty entries survive a reload
        // and _saveIndexKey can append safely
        if (!empty($lines))
            fwrite($fh, "\n");
        fclose($fh);
        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->_cacheIndexDir($idx, $suffix, empty($lines));
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           the line without trailing whitespace; empty
     *                          when the file or the line does not exist
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        // $line is false when the file has fewer than $id+1 lines
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
     *
     * Missing lines before $id are filled with empty lines.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write
     * @return boolean          false when the temporary file could not be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        if (substr($line, -1) != "\n")
            $line .= "\n";
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        $ih = @fopen($fn.'.idx', 'r');
        if ($ih) {
            $ln = -1;
            while (($curline = fgets($ih)) !== false) {
                // a last line without newline would swallow whatever follows
                if (substr($curline, -1) != "\n")
                    $curline .= "\n";
                fwrite($fh, (++$ln == $id) ? $line : $curline);
            }
            if ($id > $ln) {
                while ($id > ++$ln)
                    fwrite($fh, "\n");
                fwrite($fh, $line);
            }
            fclose($ih);
        } else {
            $ln = -1;
            while ($id > ++$ln)
                fwrite($fh, "\n");
            fwrite($fh, $line);
        }
        fclose($fh);
        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->_cacheIndexDir($idx, $suffix);
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|false        line number of the value in the index, or
     *                          false when the index could not be written
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _addIndexKey($idx, $suffix, $value) {
        $index = $this->_getIndex($idx, $suffix);
        // strict: numeric-looking names ("1e3" vs "1000") must not collide
        $id = array_search((string)$value, $index, true);
        if ($id === false) {
            $id = count($index);
            $index[$id] = $value;
            if (!$this->_saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Maintain the cache file listing the suffixes that exist for an index.
     *
     * The cache is 'lengths.idx' for the 'i' index and '<idx>lengths.idx'
     * for any other index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier to add or remove
     * @param boolean   $delete remove the suffix from the cache instead of adding it
     */
    private function _cacheIndexDir($idx, $suffix, $delete=false) {
        global $conf;
        if ($idx == 'i')
            $cachename = $conf['indexdir'].'/lengths';
        else
            $cachename = $conf['indexdir'].'/'.$idx.'lengths';
        $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($lengths === false) $lengths = array();
        $old = array_search((string)$suffix, $lengths, true);
        if ($delete) {
            if ($old === false) return;
            unset($lengths[$old]);
        } else {
            if ($old !== false) return;
            $lengths[] = $suffix;
            sort($lengths);
        }
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            return;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (isset($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * @return array list of integer lengths
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    private function _listIndexLengths() {
        global $conf;
        $cachename = $conf['indexdir'].'/lengths';
        clearstatcache();
        if (@file_exists($cachename.'.idx')) {
            $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length)
                    $idx[] = (int)$length;
                return $idx;
            }
        }

        $dir = @opendir($conf['indexdir']);
        if ($dir === false)
            return array();
        $lengths = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i))
                    $lengths[] = (int)$i;
            }
        }
        closedir($dir);
        sort($lengths);
        // save this in a file
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            // the cache is only an optimisation; the list itself is still valid
            return $lengths;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (isset($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');

        return $lengths;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
     *
     * @param array|int $filter array keyed by length (only the existence of
     *                          the matching index files is tested), or a
     *                          minimum length
     * @return array            list of lengths
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    private function _indexLengths($filter) {
        global $conf;
        $idx = array();
        if (is_array($filter)) {
            // testing if index files exist only
            $path = $conf['indexdir']."/i";
            foreach ($filter as $key => $value) {
                if (@file_exists($path.$key.'.idx'))
                    $idx[] = $key;
            }
        } else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $key => $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter)
                    $idx[] = $length;
            }
        }
        return $idx;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * A line is a ':' separated list of "id*count" tuples. Any existing
     * tuple for $id is removed; a new one is prepended unless $count is
     * empty.
     *
     * @param string     $line  the current line
     * @param int|string $id    the id of the tuple
     * @param int        $count the new count, 0 to remove the tuple
     * @return string           the updated line
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _updateTuple($line, $id, $count) {
        $newLine = $line;
        if ($newLine !== '')
            $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine);
        $newLine = trim($newLine, ':');
        if ($count) {
            if (strlen($newLine) > 0)
                return "$id*$count:".$newLine;
            else
                return "$id*$count".$newLine;
        }
        return $newLine;
    }

    /**
     * Split a line into an array of tuples.
     *
     * @param arrayref  $keys   list mapping the ids used in the line to names
     * @param string    $line   ':' separated list of "id*count" tuples
     * @return array            name => count; tuples with an empty count or
     *                          an unknown or empty name are skipped
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    private function _parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple == '') continue;
            // pad so a malformed tuple without '*' is skipped silently
            list($key, $cnt) = array_pad(explode('*', $tuple, 2), 2, '');
            if (!$cnt) continue;
            if (!isset($keys[$key])) continue;
            $key = $keys[$key];
            if (!$key) continue;
            $result[$key] = $cnt;
        }
        return $result;
    }
}

/**
 * Create an instance of the indexer.
 *
 * The same instance is returned on every call.
 *
 * @return object               a Doku_Indexer
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer = null;
    if (is_null($Indexer)) {
        $Indexer = new Doku_Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from the stopwords.txt of the configured language.
 *
 * @return array                list of stop words
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if(@file_exists($swfile)){
            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
        }else{
            $stopwords = array();
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @return boolean              the function completed successfully; false
 *                              also when the index was already up to date
 *                              or the indexer was locked
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false) {
    // check if indexing needed
    $idxtag = metaFN($page,'.indexed');
    if(@file_exists($idxtag)){
        // compare as strings: "3+cmd" must not be considered equal to 3
        if(trim(io_readFile($idxtag)) === (string)idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    $body = '';
    $data = array($page, $body);
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    list($page,$body) = $data;

    $Indexer = idx_get_indexer();
    $result = $Indexer->addPageWords($page, $body);
    // must be strict: true == "locked" holds under loose comparison
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    if ($verbose)
        print("Indexer: finished".DOKU_LF);
    // the outcome must not depend on the verbosity
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param arrayref      $words  list of words to search for
 * @return array                list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * @param string        $string plain text
 * @param boolean       $wc     are wildcards allowed?
 * @return array                list of words in the text
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * Unlike Doku_Indexer::_getIndex() the lines keep their line endings.
 *
 * @param string        $idx    name of the index
 * @param string        $suffix subpart identifier
 * @return array                list of lines; empty when the file does not exist
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!@file_exists($fn)) return array();
    return file($fn);
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The cache file is only used while it is younger than
 * $conf['readdircache'] seconds; a value of 0 disables the cache.
 *
 * @return array list of integer lengths
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_listIndexLengths() {
    global $conf;
    // testing what we have to do, create a cache file or not.
    if ($conf['readdircache'] == 0) {
        $docache = false;
    } else {
        clearstatcache();
        if (@file_exists($conf['indexdir'].'/lengths.idx')
        && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
            if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
        $docache = true;
    }

    if ($conf['readdircache'] == 0 || $docache) {
        $dir = @opendir($conf['indexdir']);
        if ($dir === false)
            return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i))
                    $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);
        // save this in a file
        if ($docache) {
            $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
            // writing the cache is best effort; skip it when the file can't be opened
            if ($handle) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }
        return $idx;
    }

    return array();
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @param array|int     $filter array keyed by length (only the existence of
 *                              the matching index files is tested), or a
 *                              minimum length
 * @return array                list of lengths
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach ($filter as $key => $value) {
            if (@file_exists($path.$key.'.idx'))
                $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $key => $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter)
                $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @param string        $name   the key name
 * @return string               the cleaned, lower-cased name
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_cleanName($name) {
    $name = utf8_romanize(trim((string)$name));
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :