<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F". // CJK Extension B
                   "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF". // CJK Extension C
                   "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F". // CJK Extension D
                   "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF". // CJK Compatibility Supplement
                   ']');
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');

/**
 * Build the version tag of the indexer, including plugin contributions.
 *
 * The tag starts with INDEXER_VERSION. Every entry that handlers of the
 * INDEXER_VERSION_GET event add to the event data is appended as
 * "+name=version", sorted by name. The result is computed once per request
 * and then served from a static cache.
 *
 * @triggers INDEXER_VERSION_GET
 *           Plugins that modify what gets indexed should hook this event and
 *           add their version info to the event data like so:
 *               $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string the bare INDEXER_VERSION when no plugin added data, the combined string otherwise
 */
function idx_get_version(){
    static $cached = null;
    if ($cached != null) {
        return $cached;
    }

    // the core version is exposed to handlers for convenience only
    $data = array('dokuwiki' => INDEXER_VERSION);
    trigger_event('INDEXER_VERSION_GET', $data, null, false);

    // the core version always leads the tag, so it is dropped before sorting
    unset($data['dokuwiki']);
    ksort($data);

    $tag = INDEXER_VERSION;
    foreach ($data as $plugin => $pluginVersion) {
        $tag .= '+'.$plugin.'='.$pluginVersion;
    }

    $cached = $tag;
    return $cached;
}

/**
 * Compute the "index length" of a word.
 *
 * Starts from the byte length and adds (byte - 0xE1) for every byte in the
 * range 0xE2..0xEF found in the word. Without this adjustment all
 * three-byte words (e.g. single Chinese characters) would end up in the
 * same w3.idx file.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w the word to measure
 * @return int the adjusted length
 */
function wordlen($w){
    $length = strlen($w);
    $found = preg_match_all('/[\xE2-\xEF]/', $w, $matches);
    if (!$found) {
        return $length;
    }
    foreach ($matches[0] as $lead) {
        $length += ord($lead) - 0xE1;
    }
    return $length;
}

/**
 * Class that encapsulates operations on the indexer database.
*
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Doku_Indexer {
    /**
     * @var array $pidCache Cache for getPID()
     */
    protected $pidCache = array();

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean  true on success, false when an index could not be
     *                         read or written, the string "locked" when the
     *                         indexer lock could not be acquired
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        // record the new frequencies, one index file per word length
        foreach ($words as $wlen => $wordfreqs) {
            $index = $this->getIndex('i', $wlen);
            foreach ($wordfreqs as $wid => $freq) {
                $idx = ($wid<count($index)) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                $pagewords[] = "$wlen*$wid";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a missing line is treated as empty, like in the loop above
                    $idx = isset($index[$wid]) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, 0);
                }
                // a failed write must not be reported as success
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $this->unlock();
                    return false;
                }
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * Tokenizes the text, groups the tokens by wordlen() and looks every
     * token up in the word list of its length ('w' index). Unknown words are
     * appended to that list; a list is written back only when it actually
     * gained a word.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word id => frequency)),
     *                          or false when a word list could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text) {

        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w=>$c) {
            // numeric tokens arrive as integer array keys
            $l = wordlen((string)$w);
            if (isset($words[$l])){
                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            }else{
                $words[$l] = array($w => $c);
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per length so unchanged word lists are not rewritten
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
*
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string   true when done, false when the page id could not
     *                          be determined, "locked" when the lock is unavailable
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null) {
        // normalise the arguments into one key => value(s) map
        if (is_array($key)) {
            if (!is_null($value)) {
                // $key is array, but $value is not null
                trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
            }
        } else {
            $key = array($key => $value);
        }

        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Titles live in their own one-line-per-page index file
        if (array_key_exists('title', $key)) {
            $title = is_array($key['title']) ? $key['title'][0] : $key['title'];
            $this->saveIndexKey('title', '', $pid, $title);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $wordsAdded = false;

            if (!is_array($values)) $values = array($values);

            // value ids currently stored for the page start out as "remove";
            // -1 means remove, 0 keep, 1 add
            $stored = $this->getIndexKey($metaname.'_p', '', $pid);
            $actions = ($stored != '')
                ? array_fill_keys(explode(':', $stored), -1)
                : array();

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val === "") continue;

                $id = array_search($val, $metawords, true);
                if ($id === false) {
                    // unknown value: append it to the word list and create an
                    // empty placeholder line in the value index
                    $id = count($metawords);
                    $metawords[$id] = $val;
                    $metaidx[$id] = '';
                    $wordsAdded = true;
                }

                // values the page already has are kept, everything else is added
                $alreadyStored = isset($actions[$id]) && $actions[$id] <= 0;
                $actions[$id] = $alreadyStored ? 0 : 1;
            }

            if ($wordsAdded) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }

            $changed = false;
            foreach ($actions as $id => $action) {
                switch ($action) {
                    case -1:
                        $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                        $changed = true;
                        unset($actions[$id]);
                        break;
                    case 1:
                        $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                        $changed = true;
                        break;
                }
            }

            if ($changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                $pagevalues = implode(':', array_keys($actions));
                $this->saveIndexKey($metaname.'_p', '', $pid, $pagevalues);
            }

            // release the index arrays before the next key is loaded
            unset($metaidx, $metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content. This function doesn't check if the
     * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
     * indexer and it deletes all previously indexed content of the new page.
*
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool true when the page was renamed, false when the index could not be updated,
     *                     or a message ('locked', 'page is not in index') describing the problem
     */
    public function renamePage($oldpage, $newpage) {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock on this error path as well, otherwise the
                // indexer stays locked until the lock is considered stale
                $this->unlock();
                return false;
            }

            // keep the line so the ids of all other pages stay stable, but
            // give it a name that will not match a real page
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
     * will be updated.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed, if exists values will be merged
     * @return bool|string      If renaming the value has been successful, false or error message on error.
*/
    public function renameMetaValue($key, $oldvalue, $newvalue) {
        if (!$this->lock()) return 'locked';

        // look both values up in the list of known values of this key
        $values = $this->getIndex($key, '_w');
        $oldid = array_search($oldvalue, $values, true);
        if ($oldid === false) {
            // nothing to rename
            $this->unlock();
            return true;
        }

        $newid = array_search($newvalue, $values, true);
        if ($newid === false) {
            // the new value is unknown: overwrite the old entry in place
            $values[$oldid] = $newvalue;
            $saved = $this->saveIndex($key.'_w', '', $values);
            $this->unlock();
            return $saved ? true : false;
        }

        // free memory
        unset($values);

        // both values exist, so the page list of the old value is merged
        // into the page list of the new one
        $oldline = $this->getIndexKey($key.'_i', '', $oldid);
        if ($oldline != '') {
            $mergedline = $this->getIndexKey($key.'_i', '', $newid);
            $pagekeys = $this->getIndex($key.'_p', '');

            foreach (explode(':', $oldline) as $tuple) {
                list($id, $count) = explode('*', $tuple);
                $mergedline = $this->updateTuple($mergedline, $id, $count);

                // in the page's own value list: drop the old value id and
                // make sure the new one is present exactly once
                $pagevalues = array_diff(explode(':', $pagekeys[$id]), array($oldid));
                if (!in_array($newid, $pagevalues)) {
                    $pagevalues[] = $newid;
                }
                $pagekeys[$id] = implode(':', $pagevalues);
            }

            $this->saveIndex($key.'_p', '', $pagekeys);
            unset($pagekeys);
            $this->saveIndexKey($key.'_i', '', $oldid, '');
            $this->saveIndexKey($key.'_i', '', $newid, $mergedline);
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
*
     * @param string    $page   a page name
     * @return string|boolean  result of deletePageNoLock(), or "locked" when the
     *                         indexer lock could not be acquired
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page) {
        if (!$this->lock())
            return "locked";

        $result = $this->deletePageNoLock($page);

        $this->unlock();

        return $result;
    }

    /**
     * Remove a page from the index without locking the index, only use this function if the index is already locked
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return boolean          true on success, false when the page id could not be
     *                          determined or the reverse index could not be written
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function deletePageNoLock($page) {
        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            return false;
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $delwords = explode(':',$pageword_idx);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, "")) {
            return false;
        }

        // drop the title and every metadata value recorded for the page
        $this->saveIndexKey('title', '', $pid, "");
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                if ($id === '') continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        return true;
    }

    /**
     * Clear the whole index
     *
     * Deletes the page, title, pageword, metadata and lengths index files
     * plus every *.idx file in the index directory whose name starts with
     * 'i' or 'w' or ends in '_w.idx', '_i.idx' or '_p.idx'. Deletion errors
     * are suppressed.
     *
     * @return bool true once the files were processed, false when the lock could not be acquired
     */
    public function clear() {
        global $conf;

        if (!$this->lock()) return false;

        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if($dir!==false){
            while(($f = readdir($dir)) !== false){
                if(substr($f,-4)=='.idx' &&
                    (substr($f,0,1)=='i' || substr($f,0,1)=='w'
                    || substr($f,-6)=='_w.idx' || substr($f,-6)=='_i.idx' || substr($f,-6)=='_p.idx'))
                    @unlink($conf['indexdir']."/$f");
            }
            // release the directory handle instead of leaking it
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Split the text into words for fulltext search
     *
     * TODO: does this also need &$stopwords ?
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of lowercased words in the text, without stopwords and
     *                          without non-numeric words shorter than IDX_MINWORDLENGTH
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false) {
        // when wildcards are allowed the '*' must survive the stripping below
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        // prepare the text to be tokenized
        $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $i => $word) {
            // the UTF-8 aware lowercasing is only used where it is needed
            $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                utf8_strtolower($word) : strtolower($word);
        }

        foreach ($wordlist as $i => $word) {
            if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
              || array_search($word, $stopwords, true) !== false)
                unset($wordlist[$i]);
        }
        return array_values($wordlist);
    }

    /**
     * Get the numeric PID of a page
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    public function getPID($page) {
        // return PID without locking when it is in the cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        if (!$this->lock())
            return false;

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return $pid;
    }

    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
*
     * The page is added to the page index when it is not listed there yet.
     * Results are kept in a small per-object cache.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    protected function getPIDNoLock($page) {
        // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
        $pid = $this->addIndexKey('page', '', $page);

        // never cache a failure, otherwise later calls would keep returning false
        if ($pid === false) return false;

        // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
        // added item will be requested again
        if (count($this->pidCache) >= 10) {
            // array_shift() must not be used here: it renumbers integer-like
            // keys, so a cached page named e.g. "2013" would afterwards be
            // found under the key 0
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }

    /**
     * Get the page id of a numeric PID
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid) {
        return $this->getIndexKey('page', '', $pid);
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
*
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens) {
        // $matches maps each token to the "length*id" locations of its words
        $matches = array();
        $wids = $this->getIndexWords($tokens, $matches);
        if (empty($wids)) return array();

        // resolve every word location to its page => count list
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->getIndex('i', $wlen);
            $lines = count($index);
            foreach ($wids[$wlen] as $ixid) {
                // ids beyond the end of the index file have no pages
                if ($ixid >= $lines) continue;
                $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // sum the counts per token and page
        $final = array();
        foreach ($matches as $word => $locations) {
            $final[$word] = array();
            foreach ($locations as $location) {
                // locations skipped above never made it into $docs
                if (!isset($docs[$location])) continue;
                foreach ($docs[$location] as $pagename => $count) {
                    // make sure the document still exists
                    if (!page_exists($pagename, '', false)) continue;
                    if (isset($final[$word][$pagename])) {
                        $final[$word][$pagename] += $count;
                    } else {
                        $final[$word][$pagename] = $count;
                    }
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
*
     * Without a callback, a leading and/or trailing '*' in a value acts as
     * a wildcard.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for, must be a string or array of strings
     * @param callback  $func   comparison function
     * @return array            lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func=null) {
        if (is_array($value)) {
            $value_array =& $value;
        } else {
            $value_array = array($value);
        }

        $metaname = idx_cleanName($key);
        $isTitle = ($key == 'title');

        // the strings the search values are compared against; for titles
        // the line number is the page id, otherwise it is the value id
        $words = $isTitle
            ? $this->getIndex('title', '')
            : $this->getIndex($metaname.'_w', '');

        // matching line number => list of search values that matched it
        $value_ids = array();

        if (is_null($func)) {
            foreach ($value_array as $val) {
                $needle = $val;
                $prefix = '^';
                $suffix = '$';
                // check for wildcards
                if (substr($needle, 0, 1) == '*') {
                    $needle = substr($needle, 1);
                    $prefix = '';
                }
                if (substr($needle, -1, 1) == '*') {
                    $needle = substr($needle, 0, -1);
                    $suffix = '';
                }

                if ($prefix && $suffix) {
                    // no wildcard: exact, case-sensitive match
                    $i = array_search($val, $words, true);
                    if ($i !== false) {
                        $value_ids[$i][] = $val;
                    }
                    continue;
                }

                $pattern = '/'.$prefix.preg_quote($needle, '/').$suffix.'/';
                foreach (array_keys(preg_grep($pattern, $words)) as $i) {
                    $value_ids[$i][] = $val;
                }
            }
        } else {
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word))) {
                        $value_ids[$i][] = $val;
                    }
                }
            }
        }

        unset($words); // free the used memory

        // every search value gets an entry, even without any hit
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        if ($isTitle) {
            // title lines correspond directly to pages
            foreach ($value_ids as $pid => $matched) {
                $page = $page_idx[$pid];
                foreach ($matched as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // each line of the value index lists the pages using that value
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $matched) {
                // parseTuples() turns page_id*1:page2_id*1 into an array
                // keyed by page, only those keys are of interest here
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($matched as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        if (!is_array($value)) $result = $result[$value];
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result) {
        // $tokens      = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild   = array( base word => base word length ... )
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();

        foreach ($words as $word) {
            $result[$word] = array();
            $base = $word;
            $len = wordlen($word);
            $anchorStart = '^';
            $anchorEnd = '$';

            // strip wildcards; each one also shortens the effective length
            if (substr($base, 0, 1) == '*') {
                $base = substr($base, 1);
                $anchorStart = '';
                $len -= 1;
            }
            if (substr($base, -1, 1) == '*') {
                $base = substr($base, 0, -1);
                $anchorEnd = '';
                $len -= 1;
            }
            $isWild = (!$anchorStart || !$anchorEnd);

            // too short exact terms are ignored unless they are numeric
            if (!$isWild && $len < IDX_MINWORDLENGTH && !is_numeric($base))
                continue;

            if (!isset($tokens[$base]))
                $tokenlength[$len][] = $base;

            if ($isWild) {
                $pattern = '/'.$anchorStart.preg_quote($base, '/').$anchorEnd.'/';
                $tokens[$base][] = array($word, $pattern);
                if (!isset($tokenwild[$base]))
                    $tokenwild[$base] = $len;
            } else {
                $tokens[$base][] = array($word, null);
            }
        }
        asort($tokenwild);

        // without wildcards only the exact lengths are requested, otherwise
        // the shortest base word length is passed on as the filter
        $hasWild = !empty($tokenwild);
        $length_filter = $hasWild ? min(array_keys($tokenlength)) : $tokenlength;
        $indexes_known = $this->indexLengths($length_filter);
        if ($hasWild) sort($indexes_known);

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // base words of exactly this length are looked up literally
            $exact = isset($tokenlength[$ixlen]) ? $tokenlength[$ixlen] : array();
            foreach ($exact as $base) {
                $wid = array_search($base, $word_idx, true);
                if ($wid === false) continue;
                $wids[$ixlen][] = $wid;
                foreach ($tokens[$base] as $entry)
                    $result[$entry[0]][] = "$ixlen*$wid";
            }

            // wildcard terms are matched against this word list only while
            // their base word is shorter than the list's word length;
            // $tokenwild is sorted by length, so the loop can stop early
            foreach ($tokenwild as $base => $len) {
                if ($len >= $ixlen) break;
                foreach ($tokens[$base] as $entry) {
                    if (is_null($entry[1])) continue;
                    foreach (array_keys(preg_grep($entry[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$entry[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
*
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null) {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        $metaname = idx_cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title)
                if ($title === "") unset($page_idx[$i]);
            return array_values($page_idx);
        }

        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            // The array union keeps the page names as keys. array_merge()
            // would renumber integer-like keys, turning a page named e.g.
            // "2013" into 0, 1, 2, ...
            $pages += $this->parseTuples($page_idx, $line);
        }
        // integer-like page names come back from array_keys() as integers
        return array_map('strval', array_keys($pages));
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @param string    $key    metadata key to list.
Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null) {
        // normalise the limits: at least one use, and $max == 0 means "no upper limit"
        $min = ($min < 1) ? 1 : $min;
        $max = ($max < $min) ? 0 : $max;

        $result = array();

        if ($key == 'title') {
            // titles: count how many pages share the same title
            $counts = array_count_values($this->getIndex('title', ''));
            foreach ($counts as $title => $cnt) {
                if ($cnt < $min) continue;
                if ($max && $cnt > $max) continue;
                if (strlen($title) < $minlen) continue;
                $result[$title] = $cnt;
            }
        }
        elseif (!is_null($key)) {
            // metadata: first collect the ids of all values in the wanted range ...
            $metaname = idx_cleanName($key);
            $selected = array();
            foreach ($this->getIndex($metaname.'_i', '') as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq < $min) continue;
                if ($max && $freq > $max) continue;
                $selected[$wid] = $freq;
            }
            // ... then load the value list only if something was selected
            if (!empty($selected)) {
                $values = $this->getIndex($metaname.'_w', '');
                foreach ($selected as $wid => $freq) {
                    if (strlen($values[$wid]) < $minlen) continue;
                    $result[$values[$wid]] = $freq;
                }
            }
        }
        else {
            // fulltext: walk the index files of all sufficiently long words
            foreach (idx_listIndexLengths() as $length) {
                if ($length < $minlen) continue;
                $index = $this->getIndex('i', $length);
                $wordlist = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq < $min) continue;
                    if ($max && $freq > $max) continue;
                    // the word list is loaded lazily on the first hit
                    if ($wordlist === null) {
                        $wordlist = $this->getIndex('w', $length);
                    }
                    $result[$wordlist[$wid]] = $freq;
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Lock the indexer.
*
     * The lock is a directory created in $conf['lockdir']. While it cannot
     * be created the attempt is repeated; a lock directory older than five
     * minutes is considered stale and removed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool|string true (or the string "stale lock removed") when the
     *                     lock was acquired, false otherwise
     */
    protected function lock() {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            if(is_dir($lock) && time()-@filemtime($lock) > 60*5){
                // looks like a stale lock - remove it
                if (!@rmdir($lock)) {
                    // removing the stale lock failed
                    return false;
                }
                $status = "stale lock removed";
            }elseif($run++ == 1000){
                // give up after about 1000 attempts
                // NOTE(review): the original comment said "we waited 5 seconds",
                // but usleep(50) is 50 microseconds, i.e. roughly 50 ms of
                // sleeping in total - confirm which one is intended
                return false;
            }
        }
        if (!empty($conf['dperm'])) {
            chmod($lock, $conf['dperm']);
        }
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * Removes the lock directory; errors are suppressed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool always true
     */
    protected function unlock() {
        global $conf;
        @rmdir($conf['lockdir'].'/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into
     * multiple parts. Different index files should use different
     * base names.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without CR or LF, empty when the
     *                          file does not exist or cannot be read
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return array();
        $lines = file($fn, FILE_IGNORE_NEW_LINES);
        // file() returns false on failure, but callers count() and iterate
        // the result, so an unreadable index is reported as empty
        return ($lines === false) ? array() : $lines;
    }

    /**
     * Replace the contents of the index with an array.
*
     * The data is written to a temporary file first and only renamed over
     * the index file once it was written completely.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param array     $lines  list of lines without LF
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;

        // every line, including the last one, is terminated by LF
        $data = join("\n", $lines);
        if (!empty($lines))
            $data .= "\n";

        $written = fwrite($fh, $data);
        fclose($fh);
        if ($written === false || $written < strlen($data)) {
            // never replace a good index with a truncated temporary file
            @unlink($fn.'.tmp');
            return false;
        }

        // same emptiness check as used for $conf['dperm'] when locking
        if (!empty($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * An empty string is returned when the index file is missing, cannot
     * be opened, or has no line with the given number.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           a line with trailing whitespace removed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        // read line by line until the requested (zero-based) line is reached;
        // when the file ends first, $line is false and casts to ''
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
*
     * The existing index is copied line by line into a temporary file with
     * the requested line replaced. When the index is shorter than $id it is
     * padded with empty lines. The temporary file then replaces the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        if (substr($line, -1) != "\n")
            $line .= "\n";
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        $ih = @fopen($fn.'.idx', 'r');
        if ($ih) {
            $ln = -1;
            while (($curline = fgets($ih)) !== false) {
                if (++$ln == $id) {
                    fwrite($fh, $line);
                } else {
                    // a final line without LF must not be glued to whatever
                    // gets appended after it
                    if (substr($curline, -1) != "\n")
                        $curline .= "\n";
                    fwrite($fh, $curline);
                }
            }
            if ($id > $ln) {
                // index is too short: pad with empty lines up to $id
                while ($id > ++$ln)
                    fwrite($fh, "\n");
                fwrite($fh, $line);
            }
            fclose($ih);
        } else {
            // no readable index yet: create one padded with empty lines
            $ln = -1;
            while ($id > ++$ln)
                fwrite($fh, "\n");
            fwrite($fh, $line);
        }
        fclose($fh);
        // same emptiness check as used for $conf['dperm'] when locking
        if (!empty($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * When the value is not in the index yet it is appended and the whole
     * index is saved again.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|bool          line number of the value in the index or false if writing the index failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value) {
        $index = $this->getIndex($idx, $suffix);
        $id = array_search($value, $index, true);
        if ($id === false) {
            // unknown value: it gets the next free line number
            $id = count($index);
            $index[$id] = $value;
            if (!$this->saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Get the list of lengths indexed in the wiki.
*
     * Thin wrapper that hands the work to idx_listIndexLengths() and
     * passes its result through unchanged.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @return array
     */
    protected function listIndexLengths() {
        $lengths = idx_listIndexLengths();
        return $lengths;
    }

    /**
     * Determine which word lengths have an index.
     *
     * With an array filter, the array keys are taken as candidate lengths
     * and those with an existing i<length>.idx file are returned. With any
     * other filter, all known lengths that are at least as large as the
     * filter (compared as integers) are returned.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @param array|int $filter
     * @return array
     */
    protected function indexLengths($filter) {
        global $conf;

        if (!is_array($filter)) {
            // keep every known length that reaches the requested minimum
            $minimum = (int)$filter;
            $matching = array();
            foreach (idx_listIndexLengths() as $len) {
                if ((int)$len >= $minimum) {
                    $matching[] = $len;
                }
            }
            return $matching;
        }

        // only test whether the index files exist
        $prefix = $conf['indexdir']."/i";
        $existing = array();
        foreach (array_keys($filter) as $len) {
            if (file_exists($prefix.$len.'.idx')) {
                $existing[] = $len;
            }
        }
        return $existing;
    }

    /**
     * Set or remove the tuple of one id within a tuple line.
     *
     * Any tuple already stored for $id is dropped from the line. When
     * $count is not empty, a new "id*count" tuple is put in front of the
     * remaining tuples.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @param string|int $id
     * @param int    $count
     * @return string
     */
    protected function updateTuple($line, $id, $count) {
        if ($line != '') {
            $pattern = '/(^|:)'.preg_quote($id,'/').'\*\d*/';
            $line = preg_replace($pattern, '', $line);
        }
        // removing the first or last tuple leaves a dangling separator
        $line = trim($line, ':');

        if (!$count) {
            return $line;
        }
        if (!$line) {
            return "$id*$count";
        }
        return "$id*$count:".$line;
    }

    /**
     * Split a line into an array of tuples.
*
     * Each tuple has the form "key*count" and tuples are separated by ':'.
     * The key of a tuple is translated through $keys; tuples with an empty
     * count, an unknown key or an empty translation are left out.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param array $keys   translation of tuple keys, passed by reference
     * @param string $line
     * @return array        counts indexed by the translated keys
     */
    protected function parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') continue;
            // pad so that a tuple without '*' yields an empty count instead
            // of an undefined offset
            list($key, $cnt) = array_pad(explode('*', $tuple), 2, '');
            if (!$cnt) continue;
            // unknown keys are skipped rather than read as undefined index
            if (!isset($keys[$key])) continue;
            $key = $keys[$key];
            if (!$key) continue;
            $result[$key] = $cnt;
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * A tuple without a count contributes 0 to the sum.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line) {
        $freq = 0;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') continue;
            // pad so that a tuple without '*' does not read an undefined offset
            list(/* $pid */, $cnt) = array_pad(explode('*', $tuple), 2, 0);
            $freq += (int)$cnt;
        }
        return $freq;
    }
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and reused afterwards.
 *
 * @return Doku_Indexer    a Doku_Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Doku_Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
*
 * The list is read once from the stopwords.txt file of the configured
 * language and kept for later calls. It is empty when that file does not
 * exist or cannot be read.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if(file_exists($swfile)){
            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on failure; always hand out an array
            if ($stopwords === false) $stopwords = array();
        }else{
            $stopwords = array();
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Pages that no longer exist, or whose 'internal index' metadata is false,
 * are removed from the index when they carry an '.indexed' tag file.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the result of the index operation; false when
 *                         nothing was done, the indexer was locked or
 *                         indexing failed
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if(!$force && file_exists($idxtag)){
        // loose comparison on purpose: the tag file holds the version as text
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only arrays carry usable keys; anything else counts as "no entries"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    if (is_array($references))
        $metadata['relation_references'] = array_keys($references);
    else
        $metadata['relation_references'] = array();

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    if (is_array($media))
        $metadata['relation_media'] = array_keys($media);
    else
        $metadata['relation_media'] = array();

    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // take back only the four values handed to the event, so that extra
    // keys in $data cannot overwrite other local variables
    extract(array_intersect_key($data, array_flip(array('page', 'body', 'metadata', 'pid'))));

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    // the return value must not depend on the verbosity
    if ($verbose) print("Indexer: finished".DOKU_LF);
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Forwards both arguments to the tokenizer() method of the shared indexer.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The returned lines keep their line endings.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array empty when the index file is missing or cannot be read
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();
    $lines = file($fn);
    // file() returns false on failure; callers expect an array
    if ($lines === false) return array();
    return $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
*
 * Reads the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is not 0, the result of the directory scan is
 * cached in lengths.idx inside the index directory and reused for that
 * many seconds. A cache file that cannot be written is not an error.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array list of lengths as integers; empty when the index
 *               directory cannot be opened
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            // the cache is still fresh, use it when it can be read
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the lengths from the i<length>.idx file names
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // only write when the file could be opened: passing false to
        // fwrite() is a TypeError that @ does not silence
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
*
 * With an array filter, the array keys are taken as candidate lengths and
 * those with an existing i<length>.idx file are returned. With any other
 * filter, all known lengths that are at least as large as the filter
 * (compared as integers) are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;

    if (!is_array($filter)) {
        // keep every known length that reaches the requested minimum
        $minimum = (int)$filter;
        $matching = array();
        foreach (idx_listIndexLengths() as $len) {
            if ((int)$len >= $minimum) {
                $matching[] = $len;
            }
        }
        return $matching;
    }

    // only test whether the index files exist
    $prefix = $conf['indexdir']."/i";
    $existing = array();
    foreach (array_keys($filter) as $len) {
        if (file_exists($prefix.$len.'.idx')) {
            $existing[] = $len;
        }
    }
    return $existing;
}

/**
 * Turn the name of a key into a string that is usable as a file name.
 *
 * The name is cast to a string, trimmed and romanized. Runs of spaces,
 * dots, slashes, colons and dashes become a single underscore, every
 * remaining character that is not an ASCII letter, digit or underscore is
 * removed, and the result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $clean = trim((string)$name);
    $clean = utf8_romanize($clean);
    $clean = preg_replace('#[ \./\\:-]+#', '_', $clean);
    $clean = preg_replace('/[^A-Za-z0-9_]/', '', $clean);
    return strtolower($clean);
}

//Setup VIM: ex: et ts=4 :