<?php
namespace dokuwiki\Search;

use dokuwiki\Extension\Event;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);


/**
 * Class DokuWiki Indexer for Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer {

    /** @var Indexer */
    protected static $instance = null;

    /** @var array $pidCache Cache for getPID() */
    protected $pidCache = array();

    /** @var array $Stopwords Words that indexer ignores */
    protected $Stopwords;

    /**
     * Indexer constructor. Singleton, thus protected!
     */
    protected function __construct() {}

    /**
     * Get new or existing singleton instance of the Indexer
     *
     * The instance is created lazily on the first call and reused afterwards.
     *
     * @return Indexer
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from the stopwords.txt file of the configured
     * language and kept in $this->Stopwords for later calls. A missing or
     * unreadable file results in an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
            $words = file_exists($swfile) ? file($swfile, FILE_IGNORE_NEW_LINES) : false;
            // file() returns false when the file cannot be read; never cache
            // that, callers expect an array
            $this->Stopwords = is_array($words) ? $words : array();
        }
        return $this->Stopwords;
    }

    /**
     * Measure the length of a string.
     * Differs from strlen in handling of asian characters.
*
     * The result is the byte length of the string plus a bonus for every
     * byte in the range 0xE2-0xEF that it contains.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $w
     * @return int
     */
    public static function wordlen($w) {
        $length = strlen($w);
        // If left alone, all chinese "words" will get put into w3.idx
        // So the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
            $length += array_sum(array_map(
                static function ($lead) {
                    return ord($lead) - 0xE1;
                },
                $matches[0]
            ));
        }
        return $length;
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The value is computed once per request and memoized in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version != null) {
            return $indexer_version;
        }

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki' => INDEXER_VERSION);
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

        // the core version always leads the version string, so it is taken
        // out before the plugin entries are sorted and appended
        unset($data['dokuwiki']);
        ksort($data);

        $version = INDEXER_VERSION;
        foreach ($data as $plugin => $vers) {
            $version .= '+'.$plugin.'='.$vers;
        }

        $indexer_version = $version;
        return $indexer_version;
    }

    /**
     * Adds/updates the search index for the given page
     *
     * Locking is handled internally.
*
     * A page that no longer exists, or whose 'internal index' metadata is
     * false, is removed from the index instead. A locked index is reported
     * as false.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string    $page   name of the page to index
     * @param boolean   $verbose    print status messages
     * @param boolean   $force  force reindexing even when the index is up to date
     * @return string|boolean  the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function addPage($page, $verbose=false, $force=false)
    {
        $idxtag = metaFN($page,'.indexed');

        // check if page was deleted but is still in the index
        if (!page_exists($page)) {
            if (!file_exists($idxtag)) {
                if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
                return false;
            }
            $deleted = $this->deletePage($page);
            if ($deleted === 'locked') {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
            return $deleted;
        }

        // check if indexing needed: the tag must carry the current indexer
        // version and be newer than the page file
        if (!$force
            && file_exists($idxtag)
            && trim(io_readFile($idxtag)) == $this->getVersion()
            && @filemtime($idxtag) > @filemtime(wikiFN($page))
        ) {
            if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
            return false;
        }

        // pages that opted out of indexing get removed from the index
        $enabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
        if ($enabled === false) {
            $removed = false;
            if (file_exists($idxtag)) {
                $removed = $this->deletePage($page);
                if ($removed === 'locked') {
                    if ($verbose) print("Indexer: locked".DOKU_LF);
                    return false;
                }
                @unlink($idxtag);
            }
            if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
            return $removed;
        }

        $pid = $this->getPID($page);
        if ($pid === false) {
            if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
            return false;
        }

        // collect the data handed to the INDEXER_PAGE_ADD event
        $body = '';
        $metadata = array();
        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

        $refs = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
        if ($refs !== null) {
            $metadata['relation_references'] = array_keys($refs);
        } else {
            $metadata['relation_references'] = array();
        }

        $mediafiles = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
        if ($mediafiles !== null) {
            $metadata['relation_media'] = array_keys($mediafiles);
        } else {
            $metadata['relation_media'] = array();
        }

        $data = compact('page', 'body', 'metadata', 'pid');
        $evt = new Event('INDEXER_PAGE_ADD', $data);
        if ($evt->advise_before()) {
            $data['body'] = $data['body'].' '.rawWiki($page);
        }
        $evt->advise_after();
        unset($evt);
        // import the (possibly handler-modified) event data back into the
        // local variables used below
        extract($data);

        $result = $this->addPageWords($page, $body);
        if ($result === 'locked') {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }

        if ($result) {
            $result = $this->addMetaKeys($page, $metadata);
            if ($result === 'locked') {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
        }

        // remember which indexer version produced the index of this page
        if ($result) {
            io_saveFile(metaFN($page,'.indexed'), $this->getVersion());
        }

        // note: in verbose mode true is returned regardless of $result
        if ($verbose) {
            print("Indexer: finished".DOKU_LF);
            return true;
        }
        return $result;
    }

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
*
     * Returns 'locked' when the index lock could not be acquired.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean  the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text)
    {
        if (!$this->lock()) return 'locked';

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        // write the frequency of every word of the page into the 'i' index
        // of its length and remember it as "length*wordid"
        $pagewords = array();
        foreach ($words as $wlen => $usage) {
            $index = $this->getIndex('i', $wlen);
            foreach ($usage as $wid => $freq) {
                if ($wid < count($index)) {
                    $tuple = $index[$wid];
                } else {
                    $tuple = '';
                }
                $index[$wid] = $this->updateTuple($tuple, $pid, $freq);
                $pagewords[] = "$wlen*$wid";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries: words recorded for the page earlier
        // that are no longer part of it get their count set to 0
        $previous = $this->getIndexKey('pageword', '', $pid);
        if ($previous !== '') {
            $obsolete = array_diff(explode(':', $previous), $pagewords);
            $bylength = array();
            foreach ($obsolete as $entry) {
                if ($entry == '') continue;
                list($wlen, $wid) = explode('*', $entry);
                $bylength[$wlen][] = (int)$wid;
            }
            foreach ($bylength as $wlen => $widlist) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widlist as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // Save the reverse index
        $saved = $this->saveIndexKey('pageword', '', $pid, join(':', $pagewords));
        $this->unlock();
        if (!$saved) {
            return false;
        }
        return true;
    }

/**
     * Split the words in a page and add them to the index.
     *
     * The text is tokenized, the tokens are grouped by their wordlen() and
     * every token is looked up in the 'w' index of its length. Unknown tokens
     * are appended to that index, which is only written back when it changed.
     *
     * @param string    $text   content of the page
     * @return array|false      list of word IDs and number of times used as
     *                          array(wordlen => array(wordid => frequency)),
     *                          false when a word index could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text)
    {
        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group as array(wordlen => array(word => frequency))
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[static::wordlen($w)][$w] = $c;
        }

        $index = array();   //resulting index
        foreach ($words as $wlen => $freqs) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length so that an unchanged index is not
            // rewritten just because an earlier length received new words
            $word_idx_modified = false;
            foreach ($freqs as $word => $freq) {
                // numeric tokens come back as integer array keys
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
*
     * Returns 'locked' when the index lock could not be acquired.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string     the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null)
    {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock()) return 'locked';

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (array_key_exists('title', $key)) {
            $title = $key['title'];
            if (is_array($title)) {
                $title = $title[0];
            }
            $this->saveIndexKey('title', '', $pid, $title);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = $this->cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $wordsAdded = false;

            if (!is_array($values)) $values = array($values);

            // value ids currently stored for the page, each mapped to an
            // action: -1 means remove, 0 keep, 1 add
            $stored = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($stored !== '') {
                $actions = array_fill_keys(explode(':', $stored), -1);
            } else {
                $actions = array();
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val === '') continue;

                $id = array_search($val, $metawords, true);
                if ($id === false) {
                    // didn't find $val, so we'll add it to the end of metawords
                    // and create a placeholder in metaidx
                    $id = count($metawords);
                    $metawords[$id] = $val;
                    $metaidx[$id] = '';
                    $wordsAdded = true;
                }
                // a value that is already in the index is kept, anything else is added
                $actions[$id] = (isset($actions[$id]) && $actions[$id] <= 0) ? 0 : 1;
            }

            if ($wordsAdded) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }

            $changed = false;
            foreach ($actions as $id => $action) {
                if ($action == -1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $changed = true;
                    unset($actions[$id]);
                    continue;
                }
                if ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $changed = true;
                }
            }

            if ($changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                // the ids left over are the values the page has now
                $this->saveIndexKey($metaname.'_p', '', $pid, implode(':', array_keys($actions)));
            }

            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content.
     * This function doesn't check if the old or new name exists in the filesystem.
     * It returns an error if the old page isn't in the page list of the indexer
     * and it deletes all previously indexed content of the new page.
*
     * The index lock is released on every return path.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool If the page was successfully renamed,
     *                     can be a message in the case of an error
     */
    public function renamePage($oldpage, $newpage)
    {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock taken above before bailing out
                $this->unlock();
                return false;
            }

            // keep the slot of the old entry so the ids of all other pages
            // stay the same, but give it a name that is not a page
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Renames a meta value in the index.
     * This doesn't change the meta value in the pages, it assumes that
     * all pages will be updated.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed,
     *                          if exists values will be merged
     * @return bool|string      If renaming the value has been successful, false
     *                          or error message on error.
*/
    public function renameMetaValue($key, $oldvalue, $newvalue)
    {
        if (!$this->lock()) return 'locked';

        // look up the old value in the list of values known for the key
        // NOTE(review): all other calls in this class pass the complete index
        // name as first and '' as second argument - confirm that splitting
        // it as ($key, '_w') addresses the same index
        $metavalues = $this->getIndex($key, '_w');
        $oldid = array_search($oldvalue, $metavalues, true);
        if ($oldid === false) {
            // the old value is not indexed, nothing to rename
            $this->unlock();
            return true;
        }

        $newid = array_search($newvalue, $metavalues, true);
        if ($newid === false) {
            // the new value is unknown so far: simply relabel the old entry
            $metavalues[$oldid] = $newvalue;
            $saved = $this->saveIndex($key.'_w', '', $metavalues);
            $this->unlock();
            return $saved ? true : false;
        }

        // free memory
        unset($metavalues);

        // okay, now we have two entries for the same value. we need to merge them.
        $indexline = $this->getIndexKey($key.'_i', '', $oldid);
        if ($indexline != '') {
            $newindexline = $this->getIndexKey($key.'_i', '', $newid);
            $pagekeys = $this->getIndex($key.'_p', '');
            foreach (explode(':', $indexline) as $part) {
                list($id, $count) = explode('*', $part);
                $newindexline = $this->updateTuple($newindexline, $id, $count);

                $keyline = explode(':', $pagekeys[$id]);
                // remove old meta value
                $keyline = array_diff($keyline, array($oldid));
                // add new meta value when not already present
                if (!in_array($newid, $keyline)) {
                    $keyline[] = $newid;
                }
                $pagekeys[$id] = implode(':', $keyline);
            }
            $this->saveIndex($key.'_p', '', $pagekeys);
            unset($pagekeys);
            $this->saveIndexKey($key.'_i', '', $oldid, '');
            $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
*
     * Returns 'locked' when the index lock could not be acquired.
     *
     * @param string    $page   a page name
     * @return string|boolean  the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page)
    {
        if (!$this->lock()) return 'locked';

        $result = $this->deletePageNoLock($page);
        $this->unlock();
        return $result;
    }

    /**
     * Remove a page from the index without locking the index,
     * only use this function if the index is already locked
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function deletePageNoLock($page)
    {
        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            return false;
        }

        // Remove obsolete index entries: every word recorded for the page
        // gets its count for this page set to 0
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $delwords = explode(':',$pageword_idx);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, '')) {
            return false;
        }

        // erase the title and the page's entries in every metadata index
        $this->saveIndexKey('title', '', $pid, '');
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                if ($id === '') continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        return true;
    }

    /**
     * Clear the whole index
     *
     * Deletes the index files from the index directory and empties the
     * PID cache. Failures to delete a single file are ignored.
     *
     * @return bool If the index has been cleared successfully
     */
    public function clear()
    {
        global $conf;

        if (!$this->lock()) return false;

        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if ($dir !== false) {
            while (($f = readdir($dir)) !== false) {
                if (in_array($f[0], ['i', 'w']) && substr($f, -4) == '.idx') {
                    // fulltext index
                    @unlink($conf['indexdir']."/$f");
                } elseif (in_array(substr($f, -6), ['_w.idx','_i.idx','_p.idx'])) {
                    // metadata index
                    @unlink($conf['indexdir']."/$f");
                }
            }
            // release the directory handle again
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The words are lowercased; stop words and non-numeric words shorter
     * than IDX_MINWORDLENGTH are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false)
    {
        // unless wildcards are allowed, '*' is treated as a special
        // character like the others listed below
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $evt = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = \dokuwiki\Utf8\Asian::separateAsianWords($text);
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = \dokuwiki\Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // the stop word list does not change while tokenizing: fetch it once
        // and turn it into a keyed lookup instead of scanning the whole list
        // for every single word
        $stoplist = $this->getStopwords();
        $stopwords = is_array($stoplist) ? array_flip($stoplist) : array();

        $wordlist = array();
        foreach (explode(' ', $text) as $word) {
            // plain ASCII words do not need the multibyte aware lowercasing
            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                \dokuwiki\Utf8\PhpString::strtolower($word) : strtolower($word);

            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
            if (isset($stopwords[$word])) continue;
            $wordlist[] = $word;
        }
        return $wordlist;
    }

    /**
     * Get the numeric PID of a page
     *
     * A PID found in the cache is returned without locking the index.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    public function getPID($page)
    {
        // return PID without locking when it is in the cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        if (!$this->lock()) return false;

        // load known documents; false is handed through on failure
        $pid = $this->getPIDNoLock($page);
        $this->unlock();
        return $pid;
    }

    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
*
     * Results are kept in a small cache ($this->pidCache) keyed by page name.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    protected function getPIDNoLock($page)
    {
        // avoid expensive addIndexKey operation for the most recently
        // requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
        $pid = $this->addIndexKey('page', '', $page);
        // limit the cache size by discarding the oldest element
        // as in DokuWiki usually only the most recently
        // added item will be requested again
        if (count($this->pidCache) > 10) {
            // array_shift() must not be used here: it renumbers integer keys,
            // and purely numeric page names are stored as integer keys, so
            // they would end up pointing at the PID of a different page
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }

    /**
     * Get the page id of a numeric PID
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid)
    {
        return $this->getIndexKey('page', '', $pid);
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
*
     * Pages for which page_exists() reports false are left out of the result.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens)
    {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $idlist) {
            $index = $this->getIndex('i', $wlen);
            $known = count($index);
            foreach (array_unique($idlist) as $ixid) {
                if ($ixid >= $known) continue;
                $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // word ids beyond the end of their index were skipped above
                // and have no entry in $docs
                if (!isset($docs[$wid])) continue;
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (isset($final[$word][$hitkey])) {
                        $final[$word][$hitkey] += $hitcnt;
                    } else {
                        $final[$word][$hitkey] = $hitcnt;
                    }
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
*
     * Without a callback a value may start and/or end with '*' as wildcard.
     * All index entries matching a value are reported, also for exact matches.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for, must be a string or array of strings
     * @param callback  $func   comparison function
     * @return array lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func=null)
    {
        if (!is_array($value)) {
            $value_array = array($value);
        } else {
            $value_array =& $value;
        }

        // the matching ids for the provided value(s)
        $value_ids = array();

        $metaname = $this->cleanName($key);

        // get all words in order to search the matching ids
        // (for titles the position in the index is the page id)
        if ($key == 'title') {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname.'_w', '');
        }

        if (!is_null($func)) {
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word))) {
                        $value_ids[$i][] = $val;
                    }
                }
            }
        } else {
            foreach ($value_array as $val) {
                $xval = $val;
                $caret = '^';
                $dollar = '$';
                // check for wildcards
                if (substr($xval, 0, 1) == '*') {
                    $xval = substr($xval, 1);
                    $caret = '';
                }
                if (substr($xval, -1, 1) == '*') {
                    $xval = substr($xval, 0, -1);
                    $dollar = '';
                }
                if (!$caret || !$dollar) {
                    $re = $caret.preg_quote($xval, '/').$dollar;
                    foreach (array_keys(preg_grep('/'.$re.'/', $words)) as $i) {
                        $value_ids[$i][] = $val;
                    }
                } else {
                    // exact match: take every position holding the value, not
                    // just the first one - several pages may share a title
                    foreach (array_keys($words, $val, true) as $i) {
                        $value_ids[$i][] = $val;
                    }
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        // Special handling for titles
        if ($key == 'title') {
            foreach ($value_ids as $pid => $val_list) {
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken
            // and matched with the pages
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                // parse the tuples of the form page_id*1:page2_id*1 and so on,
                // return value is an array with page_id => 1, page2_id => 1 etc.
                // so take the keys only
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($val_list as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }
        // a single search value gets its list of pages directly
        if (!is_array($value)) $result = $result[$value];
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result)
    {
        // $terms     = array( base word => array( [ query term , regexp ] ... ) ... )
        // $byLength  = array( base word length => base word ... )
        // $wildcards = array( base word => base word length ... )
        $terms = array();
        $byLength = array();
        $wildcards = array();

        foreach ($words as $word) {
            $result[$word] = array();
            $head = '^';
            $tail = '$';
            $base = $word;
            $length = static::wordlen($word);

            // strip wildcards; they do not count towards the word length
            if (substr($base, 0, 1) == '*') {
                $base = substr($base, 1);
                $head = '';
                $length -= 1;
            }
            if (substr($base, -1, 1) == '*') {
                $base = substr($base, 0, -1);
                $tail = '';
                $length -= 1;
            }
            $anchored = ($head && $tail);

            // too short exact words are never indexed, numbers always are
            if ($anchored && $length < IDX_MINWORDLENGTH && !is_numeric($base)) {
                continue;
            }

            if (!isset($terms[$base])) {
                $byLength[$length][] = $base;
            }

            if ($anchored) {
                $terms[$base][] = array($word, null);
                continue;
            }

            $re = $head.preg_quote($base, '/').$tail;
            $terms[$base][] = array($word, '/'.$re.'/');
            if (!isset($wildcards[$base])) {
                $wildcards[$base] = $length;
            }
        }
        asort($wildcards);

        // exact searches only need the indexes of their own lengths, a
        // wildcard search needs every index from the shortest base word on
        $length_filter = empty($wildcards) ? $byLength : min(array_keys($byLength));
        $indexes_known = $this->indexLengths($length_filter);
        if (!empty($wildcards)) {
            sort($indexes_known);
        }

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            $exact = isset($byLength[$ixlen]) ? $byLength[$ixlen] : array();
            foreach ($exact as $base) {
                $wid = array_search($base, $word_idx, true);
                if ($wid === false) {
                    continue;
                }
                $wids[$ixlen][] = $wid;
                foreach ($terms[$base] as $term) {
                    $result[$term[0]][] = "$ixlen*$wid";
                }
            }

            // handle wildcard search; $wildcards is sorted by length, so the
            // first base word that is not shorter than this index ends the scan
            foreach ($wildcards as $base => $length) {
                if ($length >= $ixlen) {
                    break;
                }
                foreach ($terms[$base] as $term) {
                    if (is_null($term[1])) {
                        continue;
                    }
                    foreach (array_keys(preg_grep($term[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$term[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
*
     * @param string $key list only pages containing the metadata key (optional)
     * @return array      list of page names
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null)
    {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        $metaname = $this->cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            // title index lines correspond to page ids: ignore pages beyond
            // the end of the title index and pages with an empty title
            $title_idx = $this->getIndex('title', '');
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title) {
                if ($title === '') unset($page_idx[$i]);
            }
            return array_values($page_idx);
        }

        // Collect the page names as array keys to de-duplicate them.
        // The array union is used instead of array_merge() because page names
        // consisting of digits only (e.g. "2020") become integer keys, and
        // array_merge() renumbers integer keys - which replaced such page
        // names with 0, 1, 2, ... It also avoids copying the whole result
        // for every index line.
        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            $pages += $this->parseTuples($page_idx, $line);
        }

        // integer-like keys come back as ints, hand out strings throughout
        return array_map('strval', array_keys($pages));
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int    $min    bottom frequency threshold
     * @param int    $max    upper frequency limit. No limit if $max<$min
     * @param int    $minlen minimum length of words to count
     * @param string $key    metadata key to list.
Uses the fulltext index if not given
     * @return array list of words as the keys and frequency as values
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null)
    {
        // normalize the limits: at least one occurrence, $max of 0 means "no limit"
        $min = ($min < 1) ? 1 : $min;
        $max = ($max < $min) ? 0 : $max;

        $result = array();

        if ($key == 'title') {
            // titles: the frequency is the number of pages sharing a title
            $titles = array_count_values($this->getIndex('title', ''));
            foreach ($titles as $title => $count) {
                if ($count < $min || ($max && $count > $max)) continue;
                if (strlen($title) < $minlen) continue;
                $result[$title] = $count;
            }
        } elseif (!is_null($key)) {
            // metadata: first pick the value ids by frequency ...
            $metaname = $this->cleanName($key);
            $frequent = array();
            foreach ($this->getIndex($metaname.'_i', '') as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq < $min || ($max && $freq > $max)) continue;
                $frequent[$wid] = $freq;
            }
            // ... then load the values themselves only when something matched
            if (!empty($frequent)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($frequent as $wid => $freq) {
                    $word = $words[$wid];
                    if (strlen($word) >= $minlen) {
                        $result[$word] = $freq;
                    }
                }
            }
        } else {
            // fulltext: one index pair per word length
            foreach ($this->listIndexLengths() as $length) {
                if ($length < $minlen) continue;
                $words = null;
                foreach ($this->getIndex('i', $length) as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq < $min || ($max && $freq > $max)) continue;
                    // the word list is loaded lazily on the first match
                    if ($words === null) {
                        $words = $this->getIndex('w', $length);
                    }
                    $result[$words[$wid]] = $freq;
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Clean a name of a key for use as a file name.
     *
     * Romanizes non-latin characters, then strips away anything that's
     * not a letter, number, or underscore.
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $name
     * @return string
     */
    protected function cleanName($name)
    {
        $clean = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
        // first collapse runs of separator characters into one underscore,
        // then drop everything that is not a letter, digit or underscore
        $clean = preg_replace(
            array('#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'),
            array('_', ''),
            $clean
        );
        return strtolower($clean);
    }

    /**
     * Acquire the indexer lock.
     *
     * The lock is a directory; creating it is retried up to 1000 times with
     * a pause of 50 microseconds in between. A lock directory older than five
     * minutes is considered stale and gets removed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool|string true or the string "stale lock removed" when the
     *                     lock was acquired, false otherwise
     */
    protected function lock()
    {
        global $conf;
        $lockDir = $conf['lockdir'].'/_indexer.lock';
        $result = true;
        $attempts = 0;

        while (!@mkdir($lockDir, $conf['dmode'])) {
            usleep(50);
            $stale = is_dir($lockDir) && (time() - @filemtime($lockDir) > 60*5);
            if ($stale) {
                // looks like a stale lock - remove it and try again
                if (!@rmdir($lockDir)) {
                    return false;
                }
                $result = "stale lock removed";
            } elseif ($attempts++ == 1000) {
                // still locked after all retries, give up
                return false;
            }
        }

        if (!empty($conf['dperm'])) {
            chmod($lockDir, $conf['dperm']);
        }
        return $result;
    }

    /**
     * Release the indexer lock by removing the lock directory.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool always true
     */
    protected function unlock()
    {
        global $conf;
        $lockDir = $conf['lockdir'].'/_indexer.lock';
        @rmdir($lockDir);
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into multiple parts.
     * Different index files should use different base names.
*
     * @param string $idx    name of the index
     * @param string $suffix subpart identifier
     * @return array         list of lines without CR or LF
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndex($idx, $suffix)
    {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return array();
        // file() returns false when the file cannot be read. Callers count()
        // and iterate the result, so an unreadable index is treated like a
        // missing one instead of leaking the false.
        $lines = file($fn, FILE_IGNORE_NEW_LINES);
        return ($lines === false) ? array() : $lines;
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The data is written to a temporary file first which then replaces the
     * index file. When the data could not be written completely the existing
     * index is left untouched.
     *
     * @param string $idx    name of the index
     * @param string $suffix subpart identifier
     * @param array  $lines  list of lines without LF
     * @return bool          If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines)
    {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;

        // every line is terminated by LF, an empty index is an empty file
        $data = join("\n", $lines);
        if (!empty($lines)) {
            $data .= "\n";
        }
        $written = fwrite($fh, $data);
        fclose($fh);

        // a failed or short write (e.g. disk full) must not replace the
        // existing index with truncated data
        if ($written !== strlen($data)) {
            @unlink($fn.'.tmp');
            return false;
        }

        if (isset($conf['fperm'])) {
            chmod($fn.'.tmp', $conf['fperm']);
        }
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * An empty string is returned when the index file is missing, cannot be
     * opened or has no line with the given number.
     *
     * @param string $idx    name of the index
     * @param string $suffix subpart identifier
     * @param int    $id     the line number
     * @return string        a line with trailing whitespace removed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id)
    {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        // read line by line until the wanted (zero based) line number is
        // reached; at the end of the file $line is false
        $ln = -1;
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        // casting turns the false of a missing line into an empty string
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
*
     * @param string $idx    name of the index
     * @param string $suffix subpart identifier
     * @param int    $id     the line number
     * @param string $line   line to write
     * @return bool          If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line)
    {
        global $conf;
        if (substr($line, -1) !== "\n") {
            $line .= "\n";
        }
        $base = $conf['indexdir'].'/'.$idx.$suffix;

        $out = @fopen($base.'.tmp', 'w');
        if (!$out) return false;

        // copy the current index into the temporary file, swapping in the
        // new content when the wanted line number comes by
        $lineNo = -1;
        $in = @fopen($base.'.idx', 'r');
        if ($in) {
            while (($current = fgets($in)) !== false) {
                $lineNo++;
                fwrite($out, ($lineNo == $id) ? $line : $current);
            }
            fclose($in);
        }

        // no index yet, or the line lies beyond its end: pad with empty
        // lines up to the wanted line number and append the new content
        if (!$in || $id > $lineNo) {
            while ($id > ++$lineNo) {
                fwrite($out, "\n");
            }
            fwrite($out, $line);
        }
        fclose($out);

        if (isset($conf['fperm'])) {
            chmod($base.'.tmp', $conf['fperm']);
        }
        io_rename($base.'.tmp', $base.'.idx');
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
*
     * @param string $idx    name of the index
     * @param string $suffix subpart identifier
     * @param string $value  line to find in the index
     * @return int|bool      line number of the value in the index
     *                       or false if writing the index failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value)
    {
        $index = $this->getIndex($idx, $suffix);
        $id = array_search($value, $index, true);
        if ($id === false) {
            // unknown value: append it, its id is the new last line number
            $id = count($index);
            $index[$id] = $value;
            if (!$this->saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * The cache file lengths.idx is only used when $conf['readdircache'] is
     * not 0; that setting is also the lifetime of the cache in seconds.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @return array
     */
    protected function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';
        $docache = ($conf['readdircache'] != 0);

        // use the cache file as long as it is fresh and readable
        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', $lengths);
                }
            }
        }

        // no usable cache: collect the lengths from the i<length>.idx files
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file. Writing the cache is best effort only, but the
        // handle has to be checked: passing the false of a failed fopen() to
        // fwrite()/fclose() throws a TypeError on PHP 8, which the @ operator
        // does not suppress.
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }
        return $idx;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
*
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @param array|int $filter
     * @return array
     */
    protected function indexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // a single number is the minimum length: keep all the known
            // lengths that are equal or superior
            $minimum = (int)$filter;
            $matching = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length >= $minimum) {
                    $matching[] = $length;
                }
            }
            return $matching;
        }

        // an array is keyed by the wanted lengths: only test whether the
        // index files for them exist
        $prefix = $conf['indexdir']."/i";
        $existing = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $existing[] = $length;
            }
        }
        return $existing;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * Any existing tuple for the id is removed from the line. Unless the
     * count is empty, a new "id*count" tuple is put in front of the line.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string     $line
     * @param string|int $id
     * @param int        $count
     * @return string
     */
    protected function updateTuple($line, $id, $count)
    {
        // drop the old tuple of this id, if there is one
        if ($line != '') {
            $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
        }
        $line = trim($line, ':');

        if (!$count) {
            return $line;
        }
        $tuple = "$id*$count";
        return $line ? $tuple.':'.$line : $tuple;
    }

    /**
     * Split a line into an array of tuples.
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param array  $keys
     * @param string $line
     * @return array
     */
    protected function parseTuples(&$keys, $line)
    {
        // The line consists of "id*count" tuples separated by colons. The
        // result maps $keys[id] to the count; tuples with an empty count or
        // an id that has no usable entry in $keys are skipped.
        $result = array();
        if ($line == '') return $result;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;
            $fields = explode('*', $tuple);
            // malformed tuple without a count, or a count of zero
            if (!isset($fields[1]) || !$fields[1]) continue;
            $id = $fields[0];
            // isset() is also false for null entries; checking before the
            // access avoids an "undefined key" warning for ids that are not
            // part of $keys
            if (!isset($keys[$id]) || $keys[$id] === false) continue;
            $result[$keys[$id]] = $fields[1];
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line)
    {
        $freq = 0;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;
            $fields = explode('*', $tuple);
            // a malformed tuple without a "*count" part contributes nothing
            if (isset($fields[1])) {
                $freq += (int)$fields[1];
            }
        }
        return $freq;
    }
}