<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

if(!defined('DOKU_INC')) die('meh.');

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 4);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F". // CJK Extension B
                   "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF". // CJK Extension C
                   "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F". // CJK Extension D
                   "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF". // CJK Compatibility Supplement
                   ']');
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value is computed once per request and cached in a static variable.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @return int|string INDEXER_VERSION, followed by "+plugin=version" for every
 *                    entry added by plugins (sorted by plugin name)
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 */
function idx_get_version(){
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki'=>$version);
        trigger_event('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin=>$vers)
            $version .= '+'.$plugin.'='.$vers;
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * @param string $w the word to measure
 * @return int byte length plus a bonus for every lead byte in 0xE2-0xEF
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function wordlen($w){
    $w = (string)$w;
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
        foreach($leadbytes[0] as $b)
            $l += ord($b) - 0xE1;
    }
    return $l;
}

/**
 * Class that encapsulates operations on the indexer database.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Doku_Indexer {

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return boolean|string   true on success, false on failure, the
     *                          string "locked" if the lock could not be taken
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        foreach ($words as $wlen => $widfreq) {
            $index = $this->getIndex('i', $wlen);
            foreach ($widfreq as $wid => $freq) {
                $idx = isset($index[$wid]) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                $pagewords[] = "$wlen*$wid";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries: words the page used before
        // (recorded in the reverse index) but does not use any more
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            $this->removePageWordRefs($pid, array_diff($oldwords, $pagewords));
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the frequency lines of the given words.
     *
     * Shared by addPageWords() and deletePage(). Failures while saving are
     * ignored, as they were in the original inline loops.
     *
     * @param int   $pid       line number of the page in the page index
     * @param array $wordrefs  list of "length*wordid" strings
     */
    protected function removePageWordRefs($pid, $wordrefs) {
        $upwords = array();
        foreach ($wordrefs as $word) {
            if ($word != '') {
                list($wlen,$wid) = explode('*', $word);
                $upwords[$wlen][] = (int)$wid;
            }
        }
        foreach ($upwords as $wlen => $widx) {
            $index = $this->getIndex('i', $wlen);
            foreach ($widx as $wid) {
                // the reverse index may reference a line that no longer exists
                if (!isset($index[$wid])) continue;
                $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
            }
            $this->saveIndex('i', $wlen, $index);
        }
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * Unknown words are appended to the word index of their length.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word ID => frequency)),
     *                          false if a word index could not be saved
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text) {
        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w=>$c) {
            // array_count_values turns numeric tokens into integer keys
            $w = (string)$w;
            $words[wordlen($w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach ($words as $wlen => $wordfreq) {
            $word_idx = $this->getIndex('w', $wlen);
            $word_idx_modified = false;  // tracked per length so untouched files are not rewritten
            foreach ($wordfreq as $word => $freq) {
                $word = (string)$word;
                // strict search: a loose one would treat "0123" and "123" as the same word
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string   true on success, false on failure, the
     *                          string "locked" if the lock could not be taken
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (array_key_exists('title', $key)) {
            $value = $key['title'];
            if (is_array($value))
                $value = $value[0];
            $this->saveIndexKey('title', '', $pid, $value);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $addwords = false;

            if (!is_array($values)) $values = array($values);

            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($val_idx != '') {
                $val_idx = explode(':', $val_idx);
                // -1 means remove, 0 keep, 1 add
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
            } else {
                $val_idx = array();
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val !== "") {
                    // strict search: values are compared as exact strings
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $addwords = true;
                    }
                    // test if value is already in the index
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0)
                        $val_idx[$id] = 0;
                    else // else add it
                        $val_idx[$id] = 1;
                }
            }

            if ($addwords)
                $this->saveIndex($metaname.'_w', '', $metawords);
            $vals_changed = false;
            foreach ($val_idx as $id => $action) {
                if ($action == 0) continue;
                // a value that was just added has no frequency line yet
                $oldline = isset($metaidx[$id]) ? $metaidx[$id] : '';
                if ($action == -1) {
                    $metaidx[$id] = $this->updateTuple($oldline, $pid, 0);
                    $vals_changed = true;
                    unset($val_idx[$id]);
                } elseif ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($oldline, $pid, 1);
                    $vals_changed = true;
                }
            }

            if ($vals_changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                $val_idx = implode(':', array_keys($val_idx));
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
            }

            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return boolean|string   true on success, false if the page is not
     *                          indexed or saving failed, the string "locked"
     *                          if the lock could not be taken
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page) {
        if (!$this->lock())
            return "locked";

        // load known documents; the page id is the line number of the page
        // name in the page index (getIndexKey() reads *by* line number and
        // therefore cannot be used to find it)
        $page_idx = $this->getIndex('page', '');
        $pid = array_search((string)$page, $page_idx, true);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $this->removePageWordRefs($pid, explode(':',$pageword_idx));
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, "")) {
            $this->unlock();
            return false;
        }

        $this->saveIndexKey('title', '', $pid, "");
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                // explode() on an empty line yields a single empty id
                if ($id === '' || !isset($meta_idx[$id])) continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        $this->unlock();
        return true;
    }

    /**
     * Split the text into words for fulltext search
     *
     * TODO: does this also need &$stopwords ?
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false) {
        // NOTE(review): the third argument of utf8_stripspecials() is presumably
        // a list of extra characters to strip, so '*' is only kept when
        // wildcards are allowed - confirm against that helper
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        // prepare the text to be tokenized
        $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        $tokens = array();
        foreach (explode(' ', $text) as $word) {
            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                    utf8_strtolower($word) : strtolower($word);
            // too short (this also drops the empty strings explode() produces)
            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
            if (in_array($word, $stopwords, true)) continue;
            $tokens[] = $word;
        }
        return $tokens;
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * @param arrayref  $tokens list of words to search for
     * @return array            list of page names with usage counts
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens) {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();
        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->getIndex('i', $wlen);
            foreach($wids[$wlen] as $ixid) {
                if (isset($index[$ixid]))
                    $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }
        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // a word can be known without having a frequency line
                if (!isset($docs[$wid])) continue;
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (!isset($final[$word][$hitkey]))
                        $final[$word][$hitkey] = $hitcnt;
                    else
                        $final[$word][$hitkey] += $hitcnt;
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * Without a callback, a '*' at the start and/or end of a value acts as
     * a wildcard.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for, must be a string or array of strings
     * @param callback  $func   comparison function
     * @return array            lists with page names, keys are query values if $value is array
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func=null) {
        if (!is_array($value))
            $value_array = array($value);
        else
            $value_array =& $value;

        // the matching ids for the provided value(s)
        $value_ids = array();

        $metaname = idx_cleanName($key);

        // get all words in order to search the matching ids
        if ($key == 'title') {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname.'_w', '');
        }

        if (!is_null($func)) {
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word)))
                        $value_ids[$i][] = $val;
                }
            }
        } else {
            foreach ($value_array as $val) {
                $xval = $val;
                $caret = '^';
                $dollar = '$';
                // check for wildcards
                if (substr($xval, 0, 1) == '*') {
                    $xval = substr($xval, 1);
                    $caret = '';
                }
                if (substr($xval, -1, 1) == '*') {
                    $xval = substr($xval, 0, -1);
                    $dollar = '';
                }
                if (!$caret || !$dollar) {
                    $re = $caret.preg_quote($xval, '/').$dollar;
                    foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i)
                        $value_ids[$i][] = $val;
                } else {
                    // exact, strict match; collect every line because several
                    // pages may share the same title
                    foreach (array_keys($words, (string)$val, true) as $i)
                        $value_ids[$i][] = $val;
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        // Special handling for titles
        if ($key == 'title') {
            // the title index is keyed by page id
            foreach ($value_ids as $pid => $val_list) {
                if (!isset($page_idx[$pid])) continue;
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken and matched with the pages
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                if (!isset($lines[$value_id])) continue;
                // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
                // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($val_list as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }
        if (!is_array($value)) $result = $result[$value];
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param arrayref  $words  The query terms.
     * @param arrayref  $result Set to word => array("length*id" ...)
     * @return array            Set to length => array(id ...)
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result) {
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            $caret = '^';
            $dollar = '$';
            $xword = $word;
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = '';
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = '';
                $wlen -= 1;
            }
            if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword))
                continue;
            if (!isset($tokens[$xword]))
                $tokenlength[$wlen][] = $xword;
            if (!$caret || !$dollar) {
                $re = $caret.preg_quote($xword, '/').$dollar;
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword]))
                    $tokenwild[$xword] = $wlen;
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    // strict search so numeric terms only match identical strings
                    $wid = array_search((string)$xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
            // handle wildcard search; $tokenwild is sorted by length, so the
            // loop can stop at the first base word that is too long
            foreach ($tokenwild as $xword => $wlen) {
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    if (is_null($w[1])) continue;
                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null) {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        $metaname = idx_cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title)
                if ($title === "") unset($page_idx[$i]);
            return array_values($page_idx);
        }

        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            // collected as keys to de-duplicate; array_merge() is avoided
            // because it would renumber numeric page names
            foreach ($this->parseTuples($page_idx, $line) as $page => $cnt)
                $pages[$page] = true;
        }
        return array_keys($pages);
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null) {
        if ($min < 1)
            $min = 1;
        if ($max < $min)
            $max = 0;

        $result = array();

        if ($key == 'title') {
            $index = $this->getIndex('title', '');
            $index = array_count_values($index);
            foreach ($index as $val => $cnt) {
                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen)
                    $result[$val] = $cnt;
            }
        }
        elseif (!is_null($key)) {
            $metaname = idx_cleanName($key);
            $index = $this->getIndex($metaname.'_i', '');
            $val_idx = array();
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max))
                    $val_idx[$wid] = $freq;
            }
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($val_idx as $wid => $freq) {
                    // the length filter needs the value itself, which is
                    // only known once the word list has been loaded
                    if (isset($words[$wid]) && strlen($words[$wid]) >= $minlen)
                        $result[$words[$wid]] = $freq;
                }
            }
        }
        else {
            $lengths = $this->listIndexLengths();
            foreach ($lengths as $length) {
                if ($length < $minlen) continue;
                $index = $this->getIndex('i', $length);
                $words = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq >= $min && (!$max || $freq <= $max)) {
                        // load the word list lazily, only for lengths with hits
                        if ($words === null)
                            $words = $this->getIndex('w', $length);
                        if (isset($words[$wid]))
                            $result[$words[$wid]] = $freq;
                    }
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory in $conf['lockdir']; a lock older than five
     * minutes is considered stale and removed.
     *
     * @return boolean|string   true (or the string "stale lock removed") when
     *                          the lock was taken, false otherwise
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function lock() {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            if(is_dir($lock) && time()-@filemtime($lock) > 60*5){
                // looks like a stale lock - remove it
                if (!@rmdir($lock)) {
                    // removing the stale lock failed
                    return false;
                }
                $status = "stale lock removed";
            }elseif($run++ == 1000){
                // give up after 1000 attempts
                // (1000 * 50 microseconds = roughly 50ms of sleeping)
                return false;
            }
        }
        if (!empty($conf['dperm']))
            chmod($lock, $conf['dperm']);
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * @return boolean always true
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function unlock() {
        global $conf;
        @rmdir($conf['lockdir'].'/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into
     * multiple parts. Different index files should use different
     * base names.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without CR or LF, empty if the
     *                          file is missing or unreadable
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return array();
        $lines = file($fn, FILE_IGNORE_NEW_LINES);
        // callers count() and index the result, so never hand out false
        return ($lines === false) ? array() : $lines;
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The data is written to a temporary file which is then renamed.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param arrayref  $lines  list of lines without LF
     * @return boolean          false if the temporary file could not be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        fwrite($fh, join("\n", $lines));
        // terminate the last line so that saveIndexKey() can append to the
        // file without gluing its line onto the previous one
        if (!empty($lines))
            fwrite($fh, "\n");
        fclose($fh);
        if (!empty($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->cacheIndexDir($idx, $suffix, empty($lines));
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           a line with trailing whitespace removed, ''
     *                          if the file or the line does not exist
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        // $line is false when the file ended before line $id was reached
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
     *
     * Missing lines before $id are created as empty lines.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write
     * @return boolean          false if the temporary file could not be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        $line = (string)$line;
        if (substr($line, -1) != "\n")
            $line .= "\n";
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        $ln = -1;
        $ih = @fopen($fn.'.idx', 'r');
        if ($ih) {
            while (($curline = fgets($ih)) !== false) {
                // an unterminated last line must not swallow what follows
                if (substr($curline, -1) != "\n")
                    $curline .= "\n";
                fwrite($fh, (++$ln == $id) ? $line : $curline);
            }
            fclose($ih);
        }
        if ($id > $ln) {
            // the index is shorter than $id: pad with empty lines
            while ($id > ++$ln)
                fwrite($fh, "\n");
            fwrite($fh, $line);
        }
        fclose($fh);
        if (!empty($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->cacheIndexDir($idx, $suffix);
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|false        line number of the value in the index, false
     *                          if the index could not be written
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value) {
        $value = (string)$value;
        $index = $this->getIndex($idx, $suffix);
        // strict search: a loose one would confuse numeric names ("0123" == "123")
        $id = array_search($value, $index, true);
        if ($id === false) {
            $id = count($index);
            $index[$id] = $value;
            if (!$this->saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Keep the cached list of index suffixes (word lengths) up to date.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier that was just written
     * @param boolean   $delete remove the suffix from the cache instead of adding it
     */
    protected function cacheIndexDir($idx, $suffix, $delete=false) {
        global $conf;
        if ($idx == 'i')
            $cachename = $conf['indexdir'].'/lengths';
        else
            $cachename = $conf['indexdir'].'/'.$idx.'lengths';
        $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($lengths === false) $lengths = array();
        $old = array_search((string)$suffix, $lengths, true);
        if ($delete) {
            if ($old === false) return;   // nothing to remove
            unset($lengths[$old]);
        } else {
            if ($old !== false) return;   // already cached
            $lengths[] = $suffix;
            sort($lengths);
        }
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            return;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (!empty($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * @return array list of word lengths as integers
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    protected function listIndexLengths() {
        global $conf;
        $cachename = $conf['indexdir'].'/lengths';
        clearstatcache();
        if (@file_exists($cachename.'.idx')) {
            $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length)
                    $idx[] = (int)$length;
                return $idx;
            }
        }

        // no usable cache: scan the index directory for i<length>.idx files
        $dir = @opendir($conf['indexdir']);
        if ($dir === false)
            return array();
        $lengths = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i))
                    $lengths[] = (int)$i;
            }
        }
        closedir($dir);
        sort($lengths);
        // save this in a file
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            // the scan result is still valid even if it could not be cached
            return $lengths;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (!empty($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');

        return $lengths;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
     *
     * @param array|int $filter array whose keys are the lengths to test for,
     *                          or a minimum length
     * @return array            list of lengths that have an index
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    protected function indexLengths($filter) {
        global $conf;
        $idx = array();
        if (is_array($filter)) {
            // testing if index files exist only
            $path = $conf['indexdir']."/i";
            foreach ($filter as $key => $value) {
                if (@file_exists($path.$key.'.idx'))
                    $idx[] = $key;
            }
        } else {
            $lengths = $this->listIndexLengths();
            foreach ($lengths as $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter)
                    $idx[] = $length;
            }
        }
        return $idx;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * A line is a ':' separated list of "id*count" tuples.
     *
     * @param string    $line   the current line
     * @param int       $id     id whose tuple is replaced
     * @param int       $count  new count; a falsy value removes the tuple
     * @return string           the updated line
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function updateTuple($line, $id, $count) {
        $newLine = (string)$line;
        if ($newLine !== '')
            $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine);
        $newLine = trim($newLine, ':');
        if (!$count)
            return $newLine;
        if ($newLine === '')
            return "$id*$count";
        return "$id*$count:".$newLine;
    }

    /**
     * Split a line into an array of tuples.
     *
     * @param arrayref  $keys   list mapping tuple ids to names
     * @param string    $line   ':' separated list of "id*count" tuples
     * @return array            name => count, for non-zero counts of known ids
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    protected function parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') continue;
            $pair = explode('*', $tuple);
            if (count($pair) < 2 || !$pair[1]) continue;   // malformed or zero count
            // skip ids that are unknown or whose name is empty
            if (!isset($keys[$pair[0]]) || $keys[$pair[0]] === '') continue;
            $result[$keys[$pair[0]]] = $pair[1];
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * @param string    $line   ':' separated list of "id*count" tuples
     * @return int              sum of all counts
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function countTuples($line) {
        $freq = 0;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') continue;
            $pair = explode('*', $tuple);
            if (isset($pair[1]))
                $freq += (int)$pair[1];
        }
        return $freq;
    }
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and shared afterwards.
 *
 * @return object               a Doku_Indexer
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer = null;
    if (is_null($Indexer)) {
        $Indexer = new Doku_Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read from the stopwords.txt of the configured language and
 * cached; it is returned by reference.
 *
 * @return array                list of stop words
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        $stopwords = array();
        if(@file_exists($swfile)){
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            if ($lines !== false) $stopwords = $lines;
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
/**
 * Adds/updates the search index for the given page
 *
 * Pages whose index tag is current are left alone; pages that no longer
 * exist or have indexing disabled are removed from the index; all other
 * pages are (re)indexed. Locking is handled internally.
 *
 * @param string  $page    name of the page to index
 * @param boolean $verbose print status messages
 * @return boolean the function completed successfully
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false) {
    $tagFile = metaFN($page,'.indexed');

    // nothing to do when the tag holds the current indexer version and is
    // newer than the page file
    if (@file_exists($tagFile)
        && trim(io_readFile($tagFile)) == idx_get_version()
        && @filemtime($tagFile) > @filemtime(wikiFN($page))) {
        if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
        return false;
    }

    if (!page_exists($page)) {
        // without a tag file there is nothing to remove for this page
        if (!@file_exists($tagFile)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $outcome = idx_get_indexer()->deletePage($page);
        if ($outcome === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($tagFile);
        return $outcome;
    }

    if (p_get_metadata($page, 'internal index', false) === false) {
        // indexing is disabled for this page: drop what was indexed before
        $outcome = false;
        if (@file_exists($tagFile)) {
            $outcome = idx_get_indexer()->deletePage($page);
            if ($outcome === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($tagFile);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $outcome;
    }

    // collect what gets indexed; the variable names below are the keys of
    // the event data and are read back by extract() afterwards
    $body = '';
    $metadata = array('title' => p_get_metadata($page, 'title', false));
    $references = p_get_metadata($page, 'relation references', false);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : array();

    $data = compact('page', 'body', 'metadata');
    $event = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($event->advise_before()) {
        // default action: append the raw wiki text to the body
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $event->advise_after();
    unset($event);
    extract($data);

    $indexer = idx_get_indexer();
    $status = $indexer->addPageWords($page, $body);
    if ($status === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($status) {
        $status = $indexer->addMetaKeys($page, $metadata);
        if ($status === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // only tag the page as indexed when both steps succeeded
    if ($status) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }

    if (!$verbose) {
        return $status;
    }
    // NOTE(review): in verbose mode true is returned here even when $status
    // is false - confirm callers rely on this before changing it
    print("Indexer: finished".DOKU_LF);
    return true;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param arrayref $words list of words to search for
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper that forwards both arguments to the tokenizer of the
 * shared indexer instance.
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * @param string $idx    base name of the index file
 * @param mixed  $suffix appended to the base name before the .idx extension
 * @return array lines of the index file as returned by file(), or an
 *               empty array when the file does not exist
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $indexFile = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    return @file_exists($indexFile) ? file($indexFile) : array();
}
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The lengths.idx cache file is only used when $conf['readdircache'] is
 * non-zero, and only while it is younger than that many seconds. After a
 * directory scan the cache file is rewritten when caching is enabled.
 *
 * @return array sorted list of word lengths (integers); empty when the
 *               index directory cannot be opened
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (@file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    // must start as a plain empty list: an array() element would end up in
    // the returned lengths and be written to the cache as "Array"
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        // index files are named i<length>.idx
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // writing the cache is best effort; never hand false to fwrite()
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
/**
 * Get the word lengths that have been indexed.
 *
 * Given an array, its keys are treated as candidate lengths and those with
 * an existing i<length>.idx file are returned. Given anything else, every
 * indexed length that is at least as large as the filter (both compared as
 * integers) is returned.
 *
 * @param array|int $filter lengths to test (as array keys) or a minimum length
 * @return array list of matching lengths
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_indexLengths($filter) {
    global $conf;

    if (!is_array($filter)) {
        // keep all the values equal or superior
        $atLeast = array();
        foreach (idx_listIndexLengths() as $len) {
            if ((int)$len >= (int)$filter) {
                $atLeast[] = $len;
            }
        }
        return $atLeast;
    }

    // testing if index files exist only
    $prefix = $conf['indexdir']."/i";
    $existing = array();
    foreach (array_keys($filter) as $candidate) {
        if (@file_exists($prefix.$candidate.'.idx')) {
            $existing[] = $candidate;
        }
    }
    return $existing;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * The trimmed name is romanized, runs of separator characters are collapsed
 * into one underscore, everything that is not a letter, number, or
 * underscore is removed and the result is lowercased.
 *
 * NOTE(review): in the single-quoted separator pattern '\\' reaches PCRE as
 * one backslash that escapes the colon, so a literal backslash is not
 * turned into an underscore but removed by the second pattern - confirm
 * that this is intended.
 *
 * @param mixed $name the key name, cast to string
 * @return string the cleaned, lowercase name
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_cleanName($name) {
    $romanized = utf8_romanize(trim((string)$name));
    $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);
    $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $underscored);
    return strtolower($stripped);
}

//Setup VIM: ex: et ts=4 :