<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

if(!defined('DOKU_INC')) die('meh.');

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 3);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * @return mixed INDEXER_VERSION, or INDEXER_VERSION.'+'.<tokenizer command>
 *               when $conf['external_tokenizer'] is enabled
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_version(){
    global $conf;
    if($conf['external_tokenizer'])
        return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']);
    else
        return INDEXER_VERSION;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length plus, for every byte in the range
 * 0xE2-0xEF found in the string, that byte's value minus 0xE1.
 *
 * @param string $w the word to measure
 * @return int the (faked) length used to pick the index file
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
        foreach($leadbytes[0] as $b)
            $l += ord($b) - 0xE1;
    }
    return $l;
}

/**
 * Class that encapsulates operations on the indexer database.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Doku_Indexer {

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return mixed    true on success, false on failure, or the string
     *                  "locked" when the index lock could not be obtained
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        if (!$this->_lock())
            return "locked";

        // load known documents
        $pid = $this->_addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->_unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->_getPageWords($text);
        if ($words === false) {
            $this->_unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->_getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    $idx = ($wid<count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->_updateTuple($idx, $pid, $freq);
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->_saveIndex('i', $wlen, $index)) {
                    $this->_unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->_getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word == '') continue;
                $parts = explode('*', $word);
                if (count($parts) < 2) continue; // malformed entry
                $upwords[$parts[0]][] = (int)$parts[1];
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->_getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // nothing to remove if the word has no line in the index
                    if (!isset($index[$wid])) continue;
                    $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0);
                }
                $this->_saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->_unlock();
            return false;
        }

        $this->_unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * @param string $text the page text
     * @return array|false array(wordlen => array(word id => frequency)),
     *                     or false when a word index could not be saved
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getPageWords($text) {
        global $conf;

        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w=>$c) {
            $l = wordlen($w);
            if (isset($words[$l])){
                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            }else{
                $words[$l] = array($w => $c);
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->_getIndex('w', $wlen);
            // only rewrite the word list of this length if a word was added to it
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                // numeric tokens come back from the array key as integers;
                // compare as strings so "10" never matches e.g. "1e1"
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add keys to the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return mixed    true on success, false on failure, or the string
     *                  "locked" when the index lock could not be obtained
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->_lock())
            return "locked";

        // load known documents
        $pid = $this->_addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->_unlock();
            return false;
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $metaidx = $this->_getIndex($metaname, '_i');
            $metawords = $this->_getIndex($metaname, '_w');
            $addwords = false;
            $update = array();
            if (!is_array($values)) $values = array($values);
            foreach ($values as $val) {
                $val = (string)$val;
                $id = array_search($val, $metawords, true);
                if ($val !== "") {
                    if ($id === false) {
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $addwords = true;
                    }
                    $old = isset($metaidx[$id]) ? $metaidx[$id] : '';
                    $metaidx[$id] = $this->_updateTuple($old, $pid, 1);
                    $update[$id] = 1;
                } elseif ($id !== false) {
                    $old = isset($metaidx[$id]) ? $metaidx[$id] : '';
                    $metaidx[$id] = $this->_updateTuple($old, $pid, 0);
                    $update[$id] = 0;
                }
            }
            if (!empty($update)) {
                if ($addwords)
                    $this->_saveIndex($metaname.'_w', '', $metawords);
                $this->_saveIndex($metaname.'_i', '', $metaidx);
                $val_line = $this->_getIndexKey($metaname, '_p', $pid);
                // an empty line means "no values yet"; exploding it would
                // produce a bogus '' entry
                $val_idx = ($val_line === '') ? array() : array_flip(explode(':', $val_line));
                foreach ($update as $id => $add) {
                    if ($add) $val_idx[$id] = 1;
                    else unset($val_idx[$id]);
                }
                $val_idx = array_keys($val_idx);
                $this->_saveIndexKey($metaname.'_p', '', $pid, implode(':', $val_idx));
            }
            unset($metaidx);
            unset($metawords);
        }

        $this->_unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
     *
     * TODO: not implemented yet; currently does nothing and returns null.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page) {
    }

    /**
     * Split the text into words for fulltext search
     *
     * TODO: does this also need &$stopwords ?
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false) {
        global $conf;
        $words = array();
        // when wildcards are not allowed, '*' is added to the characters stripped below
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') {
            if (0 == io_exec($conf['tokenizer_cmd'], $text, $output))
                $text = $output;
        } else {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }
        // turn every line break and tab into a space. The array form is used
        // because the string form of strtr() ignores the surplus characters
        // when the "from" and "to" strings differ in length.
        $text = strtr($text, array("\r" => ' ', "\n" => ' ', "\t" => ' '));
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $word) {
            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                utf8_strtolower($word) : strtolower($word);
            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
            if (array_search($word, $stopwords) !== false) continue;
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * @param arrayref  $tokens list of words to search for
     * @return array            list of page names with usage counts
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens) {
        $result = array();
        $wids = $this->_getIndexWords($tokens, $result);
        if (empty($wids)) return array();
        // load known words and documents
        $page_idx = $this->_getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->_getIndex('i', $wlen);
            foreach($wids[$wlen] as $ixid) {
                if ($ixid < count($index))
                    $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]);
            }
        }
        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // a word id beyond the end of its index file has no entry in $docs
                if (!isset($docs[$wid])) continue;
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (!isset($final[$word][$hitkey]))
                        $final[$word][$hitkey] = $hitcnt;
                    else
                        $final[$word][$hitkey] += $hitcnt;
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function
     *
     * TODO: not implemented yet; always returns an empty array.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for
     * @param callback  $func   comparison function
     * @return array            list with page names, keys are query values if more than one given
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey($key, $value, $func=null) {
        return array();
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param arrayref  $words  The query terms.
     * @param arrayref  $result Set to word => array("length*id" ...)
     * @return array            Set to length => array(id ...)
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndexWords(&$words, &$result) {
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            $caret = false;
            $dollar = false;
            $xword = (string)$word;
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = true;
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = true;
                $wlen -= 1;
            }
            if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword))
                continue;
            if (!isset($tokens[$xword]))
                $tokenlength[$wlen][] = $xword;
            if ($caret || $dollar) {
                $re = preg_quote($xword, '/');
                if ($caret) $re = '^'.$re;
                if ($dollar) $re = $re.'$';
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword]))
                    $tokenwild[$xword] = $wlen;
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->_indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->_getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    // strict string match, so numeric-looking terms are not
                    // compared as numbers
                    $wid = array_search((string)$xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // $tokenwild is sorted by length, so no later entry can qualify
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    if (is_null($w[1])) continue;
                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     *
     * TODO: filtering by metadata key is not implemented yet; an empty
     * list is returned when $key is given.
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null) {
        $page_idx = $this->_getIndex('page', '');
        if (is_null($key)) return $page_idx;
        return array();
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * TODO: not implemented yet; currently does nothing and returns null.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $key=null) {
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory created under $conf['lockdir']. If it already
     * exists and is older than five minutes it is removed and creation is
     * retried; otherwise the call gives up immediately.
     *
     * @return mixed true when the lock was obtained, the string
     *               "stale lock removed" when it was obtained after removing
     *               a stale lock, false when the indexer is locked
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _lock() {
        global $conf;
        $status = true;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            if (time() - @filemtime($lock) > 60*5) {
                // looks like a stale lock, remove it
                @rmdir($lock);
                $status = "stale lock removed";
            } else {
                return false;
            }
        }
        if ($conf['dperm'])
            chmod($lock, $conf['dperm']);
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * @return boolean always true
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _unlock() {
        global $conf;
        @rmdir($conf['lockdir'].'/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * @param string $idx    base name of the index file
     * @param mixed  $suffix appended to the base name (e.g. a word length)
     * @return array lines of the file without line endings; empty if the
     *               file does not exist
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return array();
        return file($fn, FILE_IGNORE_NEW_LINES);
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The data is written to a .tmp file which is then renamed over the
     * .idx file.
     *
     * @param string   $idx    base name of the index file
     * @param mixed    $suffix appended to the base name
     * @param arrayref $lines  the new lines of the index
     * @return boolean false if the temporary file could not be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        fwrite($fh, join("\n", $lines));
        fclose($fh);
        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->_cacheIndexDir($idx, $suffix, empty($lines));
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * @param string $idx    base name of the index file
     * @param mixed  $suffix appended to the base name
     * @param int    $id     zero-based line number
     * @return string the line without trailing whitespace; '' if the file
     *                or the line does not exist
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        // $line is false here when the file has fewer than $id+1 lines
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
     *
     * The existing file is copied line by line into a .tmp file with line
     * $id replaced; if the file is shorter than $id lines it is padded with
     * empty lines so the new line really ends up at position $id.
     *
     * @param string $idx    base name of the index file
     * @param mixed  $suffix appended to the base name
     * @param int    $id     zero-based line number
     * @param string $line   the new content of the line
     * @return boolean false if the temporary file could not be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        if (substr($line, -1) != "\n")
            $line .= "\n";
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        $ln = -1;
        $written = '';
        $ih = @fopen($fn.'.idx', 'r');
        if ($ih) {
            while (($curline = fgets($ih)) !== false) {
                $written = (++$ln == $id) ? $line : $curline;
                fwrite($fh, $written);
            }
            fclose($ih);
        }
        if ($id > $ln) {
            // terminate a last line that had no line ending before appending
            if ($ln >= 0 && substr($written, -1) != "\n")
                fwrite($fh, "\n");
            // pad with empty lines up to the requested position
            while ($id > ++$ln)
                fwrite($fh, "\n");
            fwrite($fh, $line);
        }
        fclose($fh);
        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->_cacheIndexDir($idx, $suffix);
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * @param string $idx    base name of the index file
     * @param mixed  $suffix appended to the base name
     * @param string $value  the line to look up or append
     * @return int|false the zero-based line number of the value, or false
     *                   if a new value could not be saved
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _addIndexKey($idx, $suffix, $value) {
        $index = $this->_getIndex($idx, $suffix);
        // strict string comparison: with a loose search "123" would match "0123"
        $id = array_search((string)$value, $index, true);
        if ($id === false) {
            $id = count($index);
            $index[$id] = $value;
            if (!$this->_saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Keep the cached list of index suffixes (lengths) up to date.
     *
     * The list is stored in 'lengths.idx' for the 'i' index and in
     * '<idx>lengths.idx' for any other index.
     *
     * @param string  $idx    base name of the index
     * @param mixed   $suffix the suffix to add to or remove from the list
     * @param boolean $delete remove the suffix instead of adding it
     */
    private function _cacheIndexDir($idx, $suffix, $delete=false) {
        global $conf;
        if ($idx == 'i')
            $cachename = $conf['indexdir'].'/lengths';
        else
            $cachename = $conf['indexdir'].'/'.$idx.'lengths';
        $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($lengths === false) $lengths = array();
        $old = array_search((string)$suffix, $lengths, true);
        if ($delete) {
            if ($old === false) return; // not listed, nothing to do
            unset($lengths[$old]);
        } else {
            if ($old !== false) return; // already listed
            $lengths[] = $suffix;
            sort($lengths);
        }
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            return;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (isset($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * @return array list of lengths as integers
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    private function _listIndexLengths() {
        global $conf;
        $cachename = $conf['indexdir'].'/lengths';
        clearstatcache();
        if (@file_exists($cachename.'.idx')) {
            $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length)
                    $idx[] = (int)$length;
                return $idx;
            }
        }

        // no usable cache: collect the lengths from the i<length>.idx file names
        $dir = @opendir($conf['indexdir']);
        if ($dir === false)
            return array();
        $lengths = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i))
                    $lengths[] = (int)$i;
            }
        }
        closedir($dir);
        sort($lengths);
        // save this in a file
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            // the list itself is still valid even though it could not be cached
            return $lengths;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (isset($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');

        return $lengths;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
     *
     * @param mixed $filter an array whose keys are the lengths to test for
     *                      an existing index file, or a minimum length
     * @return array list of lengths
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    private function _indexLengths($filter) {
        global $conf;
        $idx = array();
        if (is_array($filter)) {
            // testing if index files exist only
            $path = $conf['indexdir']."/i";
            foreach ($filter as $key => $value) {
                if (@file_exists($path.$key.'.idx'))
                    $idx[] = $key;
            }
        } else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $key => $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter)
                    $idx[] = $length;
            }
        }
        return $idx;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * A line is a ':'-separated list of "id*count" tuples. Any existing
     * tuple for $id is removed; if $count is non-zero a new tuple is put
     * at the front of the line.
     *
     * @param string $line  the current line
     * @param mixed  $id    the id of the tuple
     * @param int    $count the new count; 0 removes the tuple
     * @return string the updated line
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _updateTuple($line, $id, $count) {
        $newLine = $line;
        if ($newLine !== '')
            $newLine = preg_replace('/(^|:)'.preg_quote((string)$id,'/').'\*\d*/', '', $newLine);
        $newLine = trim($newLine, ':');
        if ($count) {
            if (strlen($newLine) > 0)
                return "$id*$count:".$newLine;
            else
                return "$id*$count".$newLine;
        }
        return $newLine;
    }

    /**
     * Split a line into an array of tuples.
     *
     * Tuples that are malformed, have a zero count, or whose id has no
     * (non-empty) entry in $keys are skipped.
     *
     * @param arrayref $keys list mapping tuple ids to names
     * @param string   $line ':'-separated list of "id*count" tuples
     * @return array name => count
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    private function _parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple == '') continue;
            $pair = explode('*', $tuple);
            if (count($pair) < 2) continue; // malformed tuple
            list($key, $cnt) = $pair;
            if (!$cnt) continue;
            if (!isset($keys[$key])) continue; // unknown id
            $key = $keys[$key];
            if (!$key) continue;
            $result[$key] = $cnt;
        }
        return $result;
    }
}

/**
 * Create an instance of the indexer.
 *
 * The same instance is returned on every call.
 *
 * @return object               a Doku_Indexer
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer = null;
    if (is_null($Indexer)) {
        $Indexer = new Doku_Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<$conf['lang']>/stopwords.txt and
 * returned by reference.
 *
 * @return array                list of stop words
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if(@file_exists($swfile)){
            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
        }else{
            $stopwords = array();
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @return boolean              the function completed successfully
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false) {
    // check if indexing needed
    $idxtag = metaFN($page,'.indexed');
    if(@file_exists($idxtag)){
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    $body = '';
    $data = array($page, $body);
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    list($page,$body) = $data;

    $Indexer = idx_get_indexer();
    $result = $Indexer->addPageWords($page, $body);
    // must be a strict comparison: true == "locked" holds in PHP
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param arrayref $words  list of words to search for
 * @return array           list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * @param string  $string plain text
 * @param boolean $wc     are wildcards allowed?
 * @return array          list of words in the text
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * Unlike Doku_Indexer::_getIndex() the returned lines keep their line
 * endings.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!@file_exists($fn)) return array();
    return file($fn);
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The cache file is only used and written when $conf['readdircache'] is
 * non-zero, and only read while it is younger than that many seconds.
 *
 * @return array list of lengths as integers
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_listIndexLengths() {
    global $conf;
    // testing what we have to do, create a cache file or not.
    if ($conf['readdircache'] == 0) {
        $docache = false;
    } else {
        clearstatcache();
        if (@file_exists($conf['indexdir'].'/lengths.idx')
        && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
            if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
        $docache = true;
    }

    // no usable cache: collect the lengths from the i<length>.idx file names
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);
    // save this in a file
    if ($docache) {
        $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
        if ($handle) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @param mixed $filter an array whose keys are the lengths to test for an
 *                      existing index file, or a minimum length
 * @return array list of lengths
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach ($filter as $key => $value) {
            if (@file_exists($path.$key.'.idx'))
                $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $key => $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter)
                $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @param string $name the key name
 * @return string the cleaned, lower-cased name
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_cleanName($name) {
    $name = utf8_romanize(trim((string)$name));
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :