<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

// refuse to run when the file is requested directly instead of being included
if(!defined('DOKU_INC')) die('meh.');

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 3);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');
// first character class: one kana; second (optional) class: a following
// character that is kept together with it as a single token
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
// non-capturing alternation of the three classes above; used by the tokenizer
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');

/**
 * Version of the indexer taking into consideration the external tokenizer.
51 * The indexer is only compatible with data written by the same version. 52 * 53 * @author Tom N Harris <tnharris@whoopdedo.org> 54 */ 55function idx_get_version(){ 56 global $conf; 57 if($conf['external_tokenizer']) 58 return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); 59 else 60 return INDEXER_VERSION; 61} 62 63/** 64 * Measure the length of a string. 65 * Differs from strlen in handling of asian characters. 66 * 67 * @author Tom N Harris <tnharris@whoopdedo.org> 68 */ 69function wordlen($w){ 70 $l = strlen($w); 71 // If left alone, all chinese "words" will get put into w3.idx 72 // So the "length" of a "word" is faked 73 if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) { 74 foreach($leadbytes[0] as $b) 75 $l += ord($b) - 0xE1; 76 } 77 return $l; 78} 79 80/** 81 * Class that encapsulates operations on the indexer database. 82 * 83 * @author Tom N Harris <tnharris@whoopdedo.org> 84 */ 85class Doku_Indexer { 86 87 /** 88 * Adds the contents of a page to the fulltext index 89 * 90 * The added text replaces previous words for the same page. 91 * An empty value erases the page. 92 * 93 * @param string $page a page name 94 * @param string $text the body of the page 95 * @return boolean the function completed successfully 96 * @author Tom N Harris <tnharris@whoopdedo.org> 97 * @author Andreas Gohr <andi@splitbrain.org> 98 */ 99 public function addPageWords($page, $text) { 100 $this->_lock(); 101 102 // load known documents 103 $page_idx = $this->_addIndexKey('page', '', $page); 104 if ($page_idx === false) { 105 $this->_unlock(); 106 return false; 107 } 108 109 $pagewords = array(); 110 // get word usage in page 111 $words = $this->_getPageWords($text); 112 if ($words === false) { 113 $this->_unlock(); 114 return false; 115 } 116 117 if (!empty($words)) { 118 foreach (array_keys($words) as $wlen) { 119 $index = $this->_getIndex('i', $wlen); 120 foreach ($words[$wlen] as $wid => $freq) { 121 $idx = ($wid<count($index)) ? 
$index[$wid] : ''; 122 $index[$wid] = $this->_updateTuple($idx, $pid, $freq); 123 $pagewords[] = "$wlen*$wid"; 124 } 125 if (!$this->_saveIndex('i', $wlen, $index)) { 126 $this->_unlock(); 127 return false; 128 } 129 } 130 } 131 132 // Remove obsolete index entries 133 $pageword_idx = $this->_getIndexKey('pageword', '', $pid); 134 if ($pageword_idx !== '') { 135 $oldwords = explode(':',$pageword_idx); 136 $delwords = array_diff($oldwords, $pagewords); 137 $upwords = array(); 138 foreach ($delwords as $word) { 139 if ($word != '') { 140 list($wlen,$wid) = explode('*', $word); 141 $wid = (int)$wid; 142 $upwords[$wlen][] = $wid; 143 } 144 } 145 foreach ($upwords as $wlen => $widx) { 146 $index = $this->_getIndex('i', $wlen); 147 foreach ($widx as $wid) { 148 $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0); 149 } 150 $this->_saveIndex('i', $wlen, $index); 151 } 152 } 153 // Save the reverse index 154 $pageword_idx = join(':', $pagewords); 155 if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) { 156 $this->_unlock(); 157 return false; 158 } 159 160 $this->_unlock(); 161 return true; 162 } 163 164 /** 165 * Split the words in a page and add them to the index. 166 * 167 * @author Andreas Gohr <andi@splitbrain.org> 168 * @author Christopher Smith <chris@jalakai.co.uk> 169 * @author Tom N Harris <tnharris@whoopdedo.org> 170 */ 171 private function _getPageWords($text) { 172 global $conf; 173 174 $tokens = $this->tokenizer($text); 175 $tokens = array_count_values($tokens); // count the frequency of each token 176 177 $words = array(); 178 foreach ($tokens as $w=>$c) { 179 $l = wordlen($w); 180 if (isset($words[$l])){ 181 $words[$l][$w] = $c + (isset($words[$l][$w]) ? 
$words[$l][$w] : 0); 182 }else{ 183 $words[$l] = array($w => $c); 184 } 185 } 186 187 // arrive here with $words = array(wordlen => array(word => frequency)) 188 $word_idx_modified = false; 189 $index = array(); //resulting index 190 foreach (array_keys($words) as $wlen) { 191 $word_idx = $this->_getIndex('w', $wlen); 192 foreach ($words[$wlen] as $word => $freq) { 193 $wid = array_search($word, $word_idx); 194 if ($wid === false) { 195 $wid = count($word_idx); 196 $word_idx[] = $word; 197 $word_idx_modified = true; 198 } 199 if (!isset($index[$wlen])) 200 $index[$wlen] = array(); 201 $index[$wlen][$wid] = $freq; 202 } 203 // save back the word index 204 if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx)) 205 return false; 206 } 207 208 return $index; 209 } 210 211 /** 212 * Add keys to the metadata index. 213 * 214 * Adding new keys does not remove other keys for the page. 215 * An empty value will erase the key. 216 * The $key parameter can be an array to add multiple keys. $value will 217 * not be used if $key is an array. 
218 * 219 * @param string $page a page name 220 * @param mixed $key a key string or array of key=>value pairs 221 * @param mixed $value the value or list of values 222 * @return boolean the function completed successfully 223 * @author Tom N Harris <tnharris@whoopdedo.org> 224 */ 225 public function addMetaKeys($page, $key, $value=null) { 226 if (!is_array($key)) { 227 $key = array($key => $value); 228 } elseif (!is_null($value)) { 229 // $key is array, but $value is not null 230 trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING); 231 } 232 233 $this->_lock(); 234 235 // load known documents 236 $pid = $this->_addIndexKey('page', '', $page); 237 if ($pid === false) { 238 $this->_unlock(); 239 return false; 240 } 241 242 foreach ($key as $name => $values) { 243 $metaname = idx_cleanName($name); 244 $metaidx = $this->_getIndex($metaname, '_i'); 245 $metawords = $this->_getIndex($metaname, '_w'); 246 $addwords = false; 247 $update = array(); 248 if (!is_array($val)) $values = array($values); 249 foreach ($values as $val) { 250 $val = (string)$val; 251 if ($val !== "") { 252 $id = array_search($val, $metawords); 253 if ($id === false) { 254 $id = count($metawords); 255 $metawords[$id] = $val; 256 $addwords = true; 257 } 258 $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 1); 259 $update[$id] = 1; 260 } else { 261 $id = array_search($val, $metawords); 262 if ($id !== false) { 263 $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 0); 264 $update[$id] = 0; 265 } 266 } 267 } 268 if (!empty($update)) { 269 if ($addwords) 270 $this->_saveIndex($metaname.'_w', '', $metawords); 271 $this->_saveIndex($metaname.'_i', '', $metaidx); 272 $val_idx = $this->_getIndexKey($metaname, '_p', $pid); 273 $val_idx = array_flip(explode(':', $val_idx)); 274 foreach ($update as $id => $add) { 275 if ($add) $val_idx[$id] = 1; 276 else unset($val_idx[$id]); 277 } 278 $val_idx = array_keys($val_idx); 279 $this->_saveIndexKey($metaname.'_p', '', 
$pid, $val_idx); 280 } 281 unset($metaidx); 282 unset($metawords); 283 } 284 return true; 285 } 286 287 /** 288 * Remove a page from the index 289 * 290 * Erases entries in all known indexes. 291 * 292 * @param string $page a page name 293 * @return boolean the function completed successfully 294 * @author Tom N Harris <tnharris@whoopdedo.org> 295 */ 296 public function deletePage($page) { 297 } 298 299 /** 300 * Split the text into words for fulltext search 301 * 302 * TODO: does this also need &$stopwords ? 303 * 304 * @param string $text plain text 305 * @param boolean $wc are wildcards allowed? 306 * @return array list of words in the text 307 * @author Tom N Harris <tnharris@whoopdedo.org> 308 * @author Andreas Gohr <andi@splitbrain.org> 309 */ 310 public function tokenizer($text, $wc=false) { 311 global $conf; 312 $words = array(); 313 $wc = ($wc) ? '' : '\*'; 314 $stopwords =& idx_get_stopwords(); 315 316 if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') { 317 if (0 == io_exec($conf['tokenizer_cmd'], $text, $output)) 318 $text = $output; 319 } else { 320 if (preg_match('/[^0-9A-Za-z ]/u', $text)) { 321 // handle asian chars as single words (may fail on older PHP version) 322 $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text); 323 if (!is_null($asia)) $text = $asia; // recover from regexp falure 324 } 325 } 326 $text = strtr($text, "\r\n\t", ' '); 327 if (preg_match('/[^0-9A-Za-z ]/u', $text)) 328 $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc); 329 330 $wordlist = explode(' ', $text); 331 foreach ($wordlist as $word) { 332 $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ? 
333 utf8_strtolower($word) : strtolower($word); 334 if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; 335 if (array_search($word, $stopwords) !== false) continue; 336 $words[] = $word; 337 } 338 return $words; 339 } 340 341 /** 342 * Find pages in the fulltext index containing the words, 343 * 344 * The search words must be pre-tokenized, meaning only letters and 345 * numbers with an optional wildcard 346 * 347 * The returned array will have the original tokens as key. The values 348 * in the returned list is an array with the page names as keys and the 349 * number of times that token appeas on the page as value. 350 * 351 * @param array $tokens list of words to search for 352 * @return array list of page names with usage counts 353 * @author Tom N Harris <tnharris@whoopdedo.org> 354 * @author Andreas Gohr <andi@splitbrain.org> 355 */ 356 public function lookup($tokens) { 357 $result = array(); 358 $wids = $this->_getIndexWords($tokens, $result); 359 if (empty($wids)) return array(); 360 // load known words and documents 361 $page_idx = $this->_getIndex('page', ''); 362 $docs = array(); 363 foreach (array_keys($wids) as $wlen) { 364 $wids[$wlen] = array_unique($wids[$wlen]); 365 $index = $this->_getIndex('i', $wlen); 366 foreach($wids[$wlen] as $ixid) { 367 if ($ixid < count($index)) 368 $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]); 369 } 370 } 371 // merge found pages into final result array 372 $final = array(); 373 foreach ($result as $word => $res) { 374 $final[$word] = array(); 375 foreach ($res as $wid) { 376 $hits = &$docs[$wid]; 377 foreach ($hits as $hitkey => $hitcnt) { 378 // make sure the document still exists 379 if (!page_exists($hitkey, '', false)) continue; 380 if (!isset($final[$word][$hitkey])) 381 $final[$word][$hitkey] = $hitcnt; 382 else 383 $final[$word][$hitkey] += $hitcnt; 384 } 385 } 386 } 387 return $final; 388 } 389 390 /** 391 * Find pages containing a metadata key. 
392 * 393 * The metadata values are compared as case-sensitive strings. Pass a 394 * callback function that returns true or false to use a different 395 * comparison function 396 * 397 * @param string $key name of the metadata key to look for 398 * @param string $value search term to look for 399 * @param callback $func comparison function 400 * @return array list with page names 401 * @author Tom N Harris <tnharris@whoopdedo.org> 402 */ 403 public function lookupKey($key, $value, $func=null) { 404 } 405 406 /** 407 * Find the index ID of each search term. 408 * 409 * The query terms should only contain valid characters, with a '*' at 410 * either the beginning or end of the word (or both). 411 * The $result parameter can be used to merge the index locations with 412 * the appropriate query term. 413 * 414 * @param array $words The query terms. 415 * @param arrayref $result Set to word => array("length*id" ...) 416 * @return array Set to length => array(id ...) 417 * @author Tom N Harris <tnharris@whoopdedo.org> 418 */ 419 private function _getIndexWords($words, &$result) { 420 $tokens = array(); 421 $tokenlength = array(); 422 $tokenwild = array(); 423 foreach ($words as $word) { 424 $result[$word] = array(); 425 $caret = false; 426 $dollar = false; 427 $xword = $word; 428 $wlen = wordlen($word); 429 430 // check for wildcards 431 if (substr($xword, 0, 1) == '*') { 432 $xword = substr($xword, 1); 433 $caret = true; 434 $wlen -= 1; 435 } 436 if (substr($xword, -1, 1) == '*') { 437 $xword = substr($xword, 0, -1); 438 $dollar = true; 439 $wlen -= 1; 440 } 441 if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword)) 442 continue; 443 if (!isset($tokens[$xword])) 444 $tokenlength[$wlen][] = $xword; 445 if ($caret || $dollar) { 446 $re = preg_quote($xword, '/'); 447 if ($caret) $re = '^'.$re; 448 if ($dollar) $re = $re.'$'; 449 $tokens[$xword][] = array($word, '/'.$re.'/'); 450 if (!isset($tokenwild[$xword])) 451 $tokenwild[$xword] = $wlen; 452 } 
else { 453 $tokens[$xword][] = array($word, null); 454 } 455 } 456 asort($tokenwild); 457 // $tokens = array( base word => array( [ query term , regexp ] ... ) ... ) 458 // $tokenlength = array( base word length => base word ... ) 459 // $tokenwild = array( base word => base word length ... ) 460 $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength)); 461 $indexes_known = $this->_indexLengths($length_filter); 462 if (!empty($tokenwild)) sort($indexes_known); 463 // get word IDs 464 $wids = array(); 465 foreach ($indexes_known as $ixlen) { 466 $word_idx = $this->_getIndex('w', $ixlen); 467 // handle exact search 468 if (isset($tokenlength[$ixlen])) { 469 foreach ($tokenlength[$ixlen] as $xword) { 470 $wid = array_search($xword, $word_idx); 471 if ($wid !== false) { 472 $wids[$ixlen][] = $wid; 473 foreach ($tokens[$xword] as $w) 474 $result[$w[0]][] = "$ixlen*$wid"; 475 } 476 } 477 } 478 // handle wildcard search 479 foreach ($tokenwild as $xword => $wlen) { 480 if ($wlen >= $ixlen) break; 481 foreach ($tokens[$xword] as $w) { 482 if (is_null($w[1])) continue; 483 foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) { 484 $wids[$ixlen][] = $wid; 485 $result[$w[0]][] = "$ixlen*$wid"; 486 } 487 } 488 } 489 } 490 return $wids; 491 } 492 493 /** 494 * Return a list of all pages 495 * 496 * @param string $key list only pages containing the metadata key (optional) 497 * @return array list of page names 498 * @author Tom N Harris <tnharris@whoopdedo.org> 499 */ 500 public function getPages($key=null) { 501 $page_idx = $this->_getIndex('page', ''); 502 if (is_null($key)) return $page_idx; 503 } 504 505 /** 506 * Return a list of words sorted by number of times used 507 * 508 * @param int $min bottom frequency threshold 509 * @param int $max upper frequency limit. No limit if $max<$min 510 * @param string $key metadata key to list. 
Uses the fulltext index if not given 511 * @return array list of words as the keys and frequency as values 512 * @author Tom N Harris <tnharris@whoopdedo.org> 513 */ 514 public function histogram($min=1, $max=0, $key=null) { 515 } 516 517 /** 518 * Lock the indexer. 519 * 520 * @author Tom N Harris <tnharris@whoopdedo.org> 521 */ 522 private function _lock() { 523 global $conf; 524 $status = true; 525 $lock = $conf['lockdir'].'/_indexer.lock'; 526 while (!@mkdir($lock, $conf['dmode'])) { 527 usleep(50); 528 if (time() - @filemtime($lock) > 60*5) { 529 // looks like a stale lock, remove it 530 @rmdir($lock); 531 $status = "stale lock removed"; 532 } else { 533 return false; 534 } 535 } 536 if ($conf['dperm']) 537 chmod($lock, $conf['dperm']); 538 return $status; 539 } 540 541 /** 542 * Release the indexer lock. 543 * 544 * @author Tom N Harris <tnharris@whoopdedo.org> 545 */ 546 private function _unlock() { 547 global $conf; 548 @rmdir($conf['lockdir'].'/_indexer.lock'); 549 return true; 550 } 551 552 /** 553 * Retrieve the entire index. 554 * 555 * @author Tom N Harris <tnharris@whoopdedo.org> 556 */ 557 private function _getIndex($idx, $suffix) { 558 global $conf; 559 $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; 560 if (!@file_exists($fn, FILE_IGNORE_NEW_LINES)) return array(); 561 return file($fn); 562 } 563 564 /** 565 * Replace the contents of the index with an array. 566 * 567 * @author Tom N Harris <tnharris@whoopdedo.org> 568 */ 569 private function _saveIndex($idx, $suffix, &$lines) { 570 global $conf; 571 $fn = $conf['indexdir'].'/'.$idx.$suffix; 572 $fh = @fopen($fn.'.tmp', 'w'); 573 if (!$fh) return false; 574 fwrite($fh, join("\n", $lines)); 575 fclose($fh); 576 if (isset($conf['fperm'])) 577 chmod($fn.'.tmp', $conf['fperm']); 578 io_rename($fn.'.tmp', $fn.'.idx'); 579 if ($suffix !== '') 580 $this->_cacheIndexDir($idx, $suffix, empty($lines)); 581 return true; 582 } 583 584 /** 585 * Retrieve a line from the index. 
586 * 587 * @author Tom N Harris <tnharris@whoopdedo.org> 588 */ 589 private function _getIndexKey($idx, $suffix, $id) { 590 global $conf; 591 $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; 592 if (!@file_exists($fn)) return ''; 593 $fh = @fopen($fn, 'r'); 594 if (!$fh) return ''; 595 $ln = -1; 596 while (($line = fgets($fh)) !== false) { 597 if (++$ln == $id) break; 598 } 599 fclose($fh); 600 return rtrim((string)$line); 601 } 602 603 /** 604 * Write a line into the index. 605 * 606 * @author Tom N Harris <tnharris@whoopdedo.org> 607 */ 608 private function _saveIndexKey($idx, $suffix, $id, $line) { 609 global $conf; 610 if (substr($line, -1) != "\n") 611 $line .= "\n"; 612 $fn = $conf['indexdir'].'/'.$idx.$suffix; 613 $fh = @fopen($fn.'.tmp', 'w'); 614 if (!fh) return false; 615 $ih = @fopen($fn.'.idx', 'r'); 616 if ($ih) { 617 $ln = -1; 618 while (($curline = fgets($ih)) !== false) { 619 fwrite($fh, (++$ln == $id) ? $line : $curline); 620 } 621 if ($id > $ln) 622 fwrite($fh, $line); 623 fclose($ih); 624 } else { 625 fwrite($fh, $line); 626 } 627 fclose($fh); 628 if (isset($conf['fperm'])) 629 chmod($fn.'.tmp', $conf['fperm']); 630 io_rename($fn.'.tmp', $fn.'.idx'); 631 if ($suffix !== '') 632 $this->_cacheIndexDir($idx, $suffix); 633 return true; 634 } 635 636 /** 637 * Retrieve or insert a value in the index. 
638 * 639 * @author Tom N Harris <tnharris@whoopdedo.org> 640 */ 641 private function _addIndexKey($idx, $suffix, $value) { 642 $index = $this->_getIndex($idx, $suffix); 643 $id = array_search($value, $index); 644 if ($id === false) { 645 $id = count($index); 646 $index[$id] = $value; 647 if (!$this->_saveIndex($idx, $suffix, $index)) { 648 trigger_error("Failed to write $idx index", E_USER_ERROR); 649 return false; 650 } 651 } 652 return $id; 653 } 654 655 private function _cacheIndexDir($idx, $suffix, $delete=false) { 656 global $conf; 657 if ($idx == 'i') 658 $cachename = $conf['indexdir'].'/lengths'; 659 else 660 $cachename = $conf['indexdir'].'/'.$idx.'lengths'; 661 $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 662 if ($lengths === false) $lengths = array(); 663 $old = array_search((string)$suffix, $lengths); 664 if (empty($lines)) { 665 if ($old === false) return; 666 unset($lengths[$old]); 667 } else { 668 if ($old !== false) return; 669 $lengths[] = $suffix; 670 sort($lengths); 671 } 672 $fh = @fopen($cachename.'.tmp', 'w'); 673 if (!$fh) { 674 trigger_error("Failed to write index cache", E_USER_ERROR); 675 return; 676 } 677 @fwrite($fh, implode("\n", $lengths)); 678 @fclose($fh); 679 if (isset($conf['fperm'])) 680 chmod($cachename.'.tmp', $conf['fperm']); 681 io_rename($cachename.'.tmp', $cachename.'.idx'); 682 } 683 684 /** 685 * Get the list of lengths indexed in the wiki. 686 * 687 * Read the index directory or a cache file and returns 688 * a sorted array of lengths of the words used in the wiki. 
689 * 690 * @author YoBoY <yoboy.leguesh@gmail.com> 691 */ 692 private function _listIndexLengths() { 693 global $conf; 694 $cachename = $conf['indexdir'].'/lengths'; 695 clearstatcache(); 696 if (@file_exists($cachename.'.idx')) { 697 $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 698 if ($lengths !== false) { 699 $idx = array(); 700 foreach ($lengths as $length) 701 $idx[] = (int)$length; 702 return $idx; 703 } 704 } 705 706 $dir = @opendir($conf['indexdir']); 707 if ($dir === false) 708 return array(); 709 $lengths[] = array(); 710 while (($f = readdir($dir)) !== false) { 711 if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') { 712 $i = substr($f, 1, -4); 713 if (is_numeric($i)) 714 $lengths[] = (int)$i; 715 } 716 } 717 closedir($dir); 718 sort($lengths); 719 // save this in a file 720 $fh = @fopen($cachename.'.tmp', 'w'); 721 if (!$fh) { 722 trigger_error("Failed to write index cache", E_USER_ERROR); 723 return; 724 } 725 @fwrite($fh, implode("\n", $lengths)); 726 @fclose($fh); 727 if (isset($conf['fperm'])) 728 chmod($cachename.'.tmp', $conf['fperm']); 729 io_rename($cachename.'.tmp', $cachename.'.idx'); 730 731 return $lengths; 732 } 733 734 /** 735 * Get the word lengths that have been indexed. 736 * 737 * Reads the index directory and returns an array of lengths 738 * that there are indices for. 
739 * 740 * @author YoBoY <yoboy.leguesh@gmail.com> 741 */ 742 private function _indexLengths($filter) { 743 global $conf; 744 $idx = array(); 745 if (is_array($filter)) { 746 // testing if index files exist only 747 $path = $conf['indexdir']."/i"; 748 foreach ($filter as $key => $value) { 749 if (@file_exists($path.$key.'.idx')) 750 $idx[] = $key; 751 } 752 } else { 753 $lengths = idx_listIndexLengths(); 754 foreach ($lengths as $key => $length) { 755 // keep all the values equal or superior 756 if ((int)$length >= (int)$filter) 757 $idx[] = $length; 758 } 759 } 760 return $idx; 761 } 762 763 /** 764 * Insert or replace a tuple in a line. 765 * 766 * @author Tom N Harris <tnharris@whoopdedo.org> 767 */ 768 private function _updateTuple($line, $id, $count) { 769 $newLine = $line; 770 if ($newLine !== '') 771 $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine); 772 $newLine = trim($newLine, ':'); 773 if ($count) { 774 if ($strlen($newLine) > 0) 775 return "$id*$count:".$newLine; 776 else 777 return "$id*$count".$newLine; 778 } 779 return $newLine; 780 } 781 782 /** 783 * Split a line into an array of tuples. 784 * 785 * @author Tom N Harris <tnharris@whoopdedo.org> 786 * @author Andreas Gohr <andi@splitbrain.org> 787 */ 788 private function _parseTuples(&$keys, $line) { 789 $result = array(); 790 if ($line == '') return $result; 791 $parts = explode(':', $line); 792 foreach ($parts as $tuple) { 793 if ($tuple == '') continue; 794 list($key, $cnt) = explode('*', $tuple); 795 if (!$cnd) continue; 796 $key = $keys[$key]; 797 if (!$key) continue; 798 $result[$key] = $cnt; 799 } 800 return $result; 801 } 802} 803 804/** 805 * Create an instance of the indexer. 
806 * 807 * @return object a Doku_Indexer 808 * @author Tom N Harris <tnharris@whoopdedo.org> 809 */ 810function & idx_get_indexer() { 811 static $Indexer = null; 812 if (is_null($Indexer)) { 813 $Indexer = new Doku_Indexer(); 814 } 815 return $Indexer; 816} 817 818/** 819 * Returns words that will be ignored. 820 * 821 * @return array list of stop words 822 * @author Tom N Harris <tnharris@whoopdedo.org> 823 */ 824function & idx_get_stopwords() { 825 static $stopwords = null; 826 if (is_null($stopwords)) { 827 global $conf; 828 $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; 829 if(@file_exists($swfile)){ 830 $stopwords = file($swfile, FILE_IGNORE_NEW_LINES); 831 }else{ 832 $stopwords = array(); 833 } 834 } 835 return $stopwords; 836} 837 838/** 839 * Adds/updates the search index for the given page 840 * 841 * Locking is handled internally. 842 * 843 * @param string $page name of the page to index 844 * @return boolean the function completed successfully 845 * @author Tom N Harris <tnharris@whoopdedo.org> 846 */ 847function idx_addPage($page) { 848 $body = ''; 849 $data = array($page, $body); 850 $evt = new Doku_Event('INDEXER_PAGE_ADD', $data); 851 if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($page); 852 $evt->advise_after(); 853 unset($evt); 854 list($page,$body) = $data; 855 856 $Indexer =& idx_get_indexer(); 857 return $Indexer->addPageWords($page, $body); 858} 859 860/** 861 * Find tokens in the fulltext index 862 * 863 * Takes an array of words and will return a list of matching 864 * pages for each one. 865 * 866 * Important: No ACL checking is done here! 
All results are 867 * returned, regardless of permissions 868 * 869 * @param array $words list of words to search for 870 * @return array list of pages found, associated with the search terms 871 */ 872function idx_lookup($words) { 873 $Indexer =& idx_get_indexer(); 874 return $Indexer->lookup($words); 875} 876 877/** 878 * Split a string into tokens 879 * 880 */ 881function idx_tokenizer($string, $wc=false) { 882 $Indexer =& idx_get_indexer(); 883 return $Indexer->tokenizer($string, $wc); 884} 885 886/* For compatibility */ 887 888/** 889 * Read the list of words in an index (if it exists). 890 * 891 * @author Tom N Harris <tnharris@whoopdedo.org> 892 */ 893function idx_getIndex($idx, $suffix) { 894 global $conf; 895 $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; 896 if (!@file_exists($fn)) return array(); 897 return file($fn); 898} 899 900/** 901 * Get the list of lengths indexed in the wiki. 902 * 903 * Read the index directory or a cache file and returns 904 * a sorted array of lengths of the words used in the wiki. 905 * 906 * @author YoBoY <yoboy.leguesh@gmail.com> 907 */ 908function idx_listIndexLengths() { 909 global $conf; 910 // testing what we have to do, create a cache file or not. 
911 if ($conf['readdircache'] == 0) { 912 $docache = false; 913 } else { 914 clearstatcache(); 915 if (@file_exists($conf['indexdir'].'/lengths.idx') 916 && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) { 917 if (($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) !== false) { 918 $idx = array(); 919 foreach ($lengths as $length) { 920 $idx[] = (int)$length; 921 } 922 return $idx; 923 } 924 } 925 $docache = true; 926 } 927 928 if ($conf['readdircache'] == 0 || $docache) { 929 $dir = @opendir($conf['indexdir']); 930 if ($dir === false) 931 return array(); 932 $idx[] = array(); 933 while (($f = readdir($dir)) !== false) { 934 if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') { 935 $i = substr($f, 1, -4); 936 if (is_numeric($i)) 937 $idx[] = (int)$i; 938 } 939 } 940 closedir($dir); 941 sort($idx); 942 // save this in a file 943 if ($docache) { 944 $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w'); 945 @fwrite($handle, implode("\n", $idx)); 946 @fclose($handle); 947 } 948 return $idx; 949 } 950 951 return array(); 952} 953 954/** 955 * Get the word lengths that have been indexed. 956 * 957 * Reads the index directory and returns an array of lengths 958 * that there are indices for. 959 * 960 * @author YoBoY <yoboy.leguesh@gmail.com> 961 */ 962function idx_indexLengths($filter) { 963 global $conf; 964 $idx = array(); 965 if (is_array($filter)) { 966 // testing if index files exist only 967 $path = $conf['indexdir']."/i"; 968 foreach ($filter as $key => $value) { 969 if (@file_exists($path.$key.'.idx')) 970 $idx[] = $key; 971 } 972 } else { 973 $lengths = idx_listIndexLengths(); 974 foreach ($lengths as $key => $length) { 975 // keep all the values equal or superior 976 if ((int)$length >= (int)$filter) 977 $idx[] = $length; 978 } 979 } 980 return $idx; 981} 982 983/** 984 * Clean a name of a key for use as a file name. 
985 * 986 * Romanizes non-latin characters, then strips away anything that's 987 * not a letter, number, or underscore. 988 * 989 * @author Tom N Harris <tnharris@whoopdedo.org> 990 */ 991function idx_cleanName($name) { 992 $name = utf8_romanize(trim((string)$name)); 993 $name = preg_replace('#[ \./\\:-]+#', '_', $name); 994 $name = preg_replace('/[^A-Za-z0-9_]/', '', $name); 995 return strtolower($name); 996} 997 998//Setup VIM: ex: et ts=4 : 999