<?php
namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);


/**
 * Class DokuWiki Indexer for Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer {

    /** @var Indexer the shared singleton instance, created lazily by getInstance() */
    protected static $instance = null;

    /** @var array $pidCache Cache for getPID(), maps page name => PID */
    protected $pidCache = array();

    /** @var array $Stopwords Words that indexer ignores (loaded lazily by getStopwords()) */
    protected $Stopwords;

    /**
     * Indexer constructor. Singleton, thus protected!
     */
    protected function __construct() {}

    /**
     * Get new or existing singleton instance of the Indexer
     *
     * Uses late static binding, so the instance is created from the class
     * the method was called on.
     *
     * @return Indexer
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from the stopwords.txt file of the configured
     * language and kept for the lifetime of this object. A missing or
     * unreadable file results in an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
            $stopwords = file_exists($swfile) ? file($swfile, FILE_IGNORE_NEW_LINES) : false;
            // file() returns false when the file cannot be read; callers
            // search this list with array_search() and need a real array
            $this->Stopwords = is_array($stopwords) ? $stopwords : array();
        }
        return $this->Stopwords;
    }

    /**
     * Measure the length of a string.
     * Differs from strlen in handling of asian characters.
     *
     * The result starts as the byte length; every byte in the range
     * 0xE2-0xEF found in the string adds (byte value - 0xE1) on top.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $w
     * @return int
     */
    public static function wordlen($w) {
        $l = strlen($w);
        // If left alone, all chinese "words" will get put into w3.idx
        // So the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
            foreach ($leadbytes[0] as $b) {
                $l += ord($b) - 0xE1;
            }
        }
        return $l;
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The value is computed once per request (function-static cache) as
     * INDEXER_VERSION followed by "+plugin=version" for every entry plugins
     * added to the event data, sorted by plugin name.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            // DokuWiki version is included for the convenience of plugins
            $data = array('dokuwiki'=>$version);
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            // the core version is already the first part of $version, so it
            // is removed before the plugin entries are sorted and appended
            unset($data['dokuwiki']);
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+'.$plugin.'='.$vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Adds/updates the search index for the given page
     *
     * Locking is handled internally.
*
     * The page is removed from the index when it no longer exists or when
     * its 'internal index' metadata is false. It is skipped when the
     * '.indexed' tag file carries the current indexer version and is newer
     * than the page file (unless $force is set).
     *
     * @triggers INDEXER_PAGE_ADD
     *     Event data holds 'page', 'body', 'metadata' and 'pid'. Unless the
     *     default action is prevented, the raw wiki text is appended to 'body'.
     *
     * @param string    $page   name of the page to index
     * @param boolean   $verbose    print status messages
     * @param boolean   $force  force reindexing even when the index is up to date
     * @return string|boolean  true when the index was updated, false when nothing
     *                         was done, the index was locked or writing failed.
     *                         The value does not depend on $verbose.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function addPage($page, $verbose=false, $force=false)
    {
        $idxtag = metaFN($page,'.indexed');
        // check if page was deleted but is still in the index
        if (!page_exists($page)) {
            if (!file_exists($idxtag)) {
                if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
                return false;
            }
            $result = $this->deletePage($page);
            if ($result === 'locked') {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
            return $result;
        }

        // check if indexing needed
        if (!$force && file_exists($idxtag)) {
            if (trim(io_readFile($idxtag)) == $this->getVersion()) {
                $last = @filemtime($idxtag);
                if ($last > @filemtime(wikiFN($page))) {
                    if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                    return false;
                }
            }
        }

        // pages that opted out of indexing are removed from the index
        $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
        if ($indexenabled === false) {
            $result = false;
            if (file_exists($idxtag)) {
                $result = $this->deletePage($page);
                if ($result === 'locked') {
                    if ($verbose) print("Indexer: locked".DOKU_LF);
                    return false;
                }
                @unlink($idxtag);
            }
            if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
            return $result;
        }

        $pid = $this->getPID($page);
        if ($pid === false) {
            if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
            return false;
        }
        $body = '';
        $metadata = array();
        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

        // only the keys (the referenced ids) are indexed; anything that is
        // not an array is treated like missing metadata
        $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
        $metadata['relation_references'] = is_array($references) ?
            array_keys($references) : array();

        $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
        $metadata['relation_media'] = is_array($media) ?
            array_keys($media) : array();

        $data = compact('page', 'body', 'metadata', 'pid');
        $evt = new Event('INDEXER_PAGE_ADD', $data);
        if ($evt->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page);
        $evt->advise_after();
        unset($evt);
        // take over whatever the event handlers changed in the event data
        extract($data);

        $result = $this->addPageWords($page, $body);
        if ($result === 'locked') {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }

        if ($result) {
            $result = $this->addMetaKeys($page, $metadata);
            if ($result === 'locked') {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
        }

        // the tag file is only written when both index updates succeeded
        if ($result) {
            io_saveFile(metaFN($page,'.indexed'), $this->getVersion());
        }
        if ($verbose) print("Indexer: finished".DOKU_LF);
        // report the real outcome; verbosity must not mask a failed update
        return $result;
    }

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
*
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean  true on success, false on failure,
     *                         'locked' when the index lock could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text)
    {
        if (!$this->lock()) return 'locked';

        // resolve the numeric id of the page
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // word ids grouped by word length, each with its frequency
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        // "length*wordid" references of all words now used by the page
        $current = array();
        if (!empty($words)) {
            foreach ($words as $wlen => $frequencies) {
                $index = $this->getIndex('i', $wlen);
                foreach ($frequencies as $wid => $freq) {
                    if ($wid < count($index)) {
                        $line = $index[$wid];
                    } else {
                        $line = '';
                    }
                    $index[$wid] = $this->updateTuple($line, $pid, $freq);
                    $current[] = "$wlen*$wid";
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        // drop the page from every word it used before but no longer uses
        $previous = $this->getIndexKey('pageword', '', $pid);
        if ($previous !== '') {
            $stale = array();
            foreach (array_diff(explode(':', $previous), $current) as $ref) {
                if ($ref == '') continue;
                list($wlen, $wid) = explode('*', $ref);
                $stale[$wlen][] = (int)$wid;
            }
            foreach ($stale as $wlen => $staleIds) {
                $index = $this->getIndex('i', $wlen);
                foreach ($staleIds as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // store the reverse index (page => words)
        $saved = $this->saveIndexKey('pageword', '', $pid, implode(':', $current));
        $this->unlock();
        if (!$saved) {
            return false;
        }
        return true;
    }

/**
     * Split the words in a page and add them to the index.
     *
     * The text is tokenized, the tokens are counted and grouped by their
     * (faked) word length. Words not yet present in the word index of their
     * length are appended to it; a word index file is only written back when
     * a word was actually added to that particular file.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word id => frequency)),
     *                          false when a word index could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text)
    {
        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group as array(wordlen => array(word => frequency)); the tokens are
        // unique array keys here, so a word can never be seen twice
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[static::wordlen($w)][$w] = $c;
        }

        $index = array();  //resulting index
        foreach ($words as $wlen => $frequencies) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length so unchanged index files are not rewritten
            $word_idx_modified = false;
            foreach ($frequencies as $word => $freq) {
                // numeric tokens come back from array_count_values() as int keys
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
*
     * For every key three indexes are maintained: "<name>_w" (the known
     * values), "<name>_i" (value id => pages using it) and "<name>_p"
     * (page id => value ids). The 'title' key is stored directly in the
     * 'title' index instead.
     *
     * @param string    $page  a page name
     * @param mixed     $key   a key string or array of key=>value pairs
     * @param mixed     $value the value or list of values
     * @return boolean|string  true on success, false when the page id could not
     *                         be determined, 'locked' when the index is locked
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null)
    {
        // normalize the arguments to a key => value(s) array
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock()) return 'locked';

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (array_key_exists('title', $key)) {
            $value = $key['title'];
            // only the first entry is used when a list of titles is given
            if (is_array($value)) {
                $value = $value[0];
            }
            $this->saveIndexKey('title', '', $pid, $value);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = $this->cleanName($name);
            // register the key in the list of known metadata keys
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $addwords = false;

            if (!is_array($values)) $values = array($values);

            // value ids currently stored for this page; all of them start
            // out as "remove" and are upgraded below when still in use
            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($val_idx !== '') {
                $val_idx = explode(':', $val_idx);
                // -1 means remove, 0 keep, 1 add
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
            } else {
                $val_idx = array();
            }

            foreach ($values as $val) {
                $val = (string)$val;
                // empty values are skipped, so a key with only empty values
                // ends up with all of its previous values marked for removal
                if ($val !== '') {
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        // didn't find $val, so we'll add it to the end of metawords
                        // and create a placeholder in metaidx
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $metaidx[$id] = '';
                        $addwords = true;
                    }
                    // test if value is already in the index
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0) {
                        $val_idx[$id] = 0;
                    } else { // else add it
                        $val_idx[$id] = 1;
                    }
                }
            }

            // the value list is only written when a new value was appended
            if ($addwords) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }
            // apply the remove/add actions to the value => pages index
            $vals_changed = false;
            foreach ($val_idx as $id => $action) {
                if ($action == -1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $vals_changed = true;
                    // removed ids must not be written to the page's value list
                    unset($val_idx[$id]);
                } elseif ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $vals_changed = true;
                }
            }

            if ($vals_changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                // the remaining keys are the value ids the page uses now
                $val_idx = implode(':', array_keys($val_idx));
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
            }

            // free the memory before the next key is processed
            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content.
     * This function doesn't check if the old or new name exists in the filesystem.
     * It returns an error if the old page isn't in the page list of the indexer
     * and it deletes all previously indexed content of the new page.
*
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool If the page was successfully renamed,
     *                     can be a message in the case of an error
     *                     ('locked', 'page is not in index')
     */
    public function renamePage($oldpage, $newpage)
    {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock on this error path as well, otherwise the
                // index stays locked after a failed rename
                $this->unlock();
                return false;
            }

            // keep the slot of the old entry occupied with a placeholder name
            // so that the line numbers (= PIDs) of all other pages stay the same
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        // the renamed page keeps its PID, only the name in the page list changes
        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Renames a meta value in the index.
     * This doesn't change the meta value in the pages, it assumes that
     * all pages will be updated.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed,
     *                          if exists values will be merged
     * @return bool|string      If renaming the value has been successful, false
     *                          or error message on error.
*/
    public function renameMetaValue($key, $oldvalue, $newvalue)
    {
        if (!$this->lock()) return 'locked';

        // look up both values in the list of known values of this key
        $metavalues = $this->getIndex($key, '_w');
        $oldid = array_search($oldvalue, $metavalues, true);
        if ($oldid === false) {
            // the old value is unknown, there is nothing to rename
            $this->unlock();
            return true;
        }

        $newid = array_search($newvalue, $metavalues, true);
        if ($newid === false) {
            // simple case: the new value does not exist yet, rename in place
            $metavalues[$oldid] = $newvalue;
            $saved = $this->saveIndex($key.'_w', '', $metavalues);
            $this->unlock();
            if (!$saved) {
                return false;
            }
            return true;
        }

        // both values exist: the entries of the old one are merged into the new one
        unset($metavalues); // free memory

        $oldline = $this->getIndexKey($key.'_i', '', $oldid);
        if ($oldline != '') {
            $mergedline = $this->getIndexKey($key.'_i', '', $newid);
            $pagekeys = $this->getIndex($key.'_p', '');
            foreach (explode(':', $oldline) as $tuple) {
                list($id, $count) = explode('*', $tuple);
                $mergedline = $this->updateTuple($mergedline, $id, $count);

                // swap the value ids in the page's own list of values:
                // drop the old id and add the new one unless already there
                $pagevalues = array_diff(explode(':', $pagekeys[$id]), array($oldid));
                if (!in_array($newid, $pagevalues)) {
                    $pagevalues[] = $newid;
                }
                $pagekeys[$id] = implode(':', $pagevalues);
            }
            $this->saveIndex($key.'_p', '', $pagekeys);
            unset($pagekeys);
            $this->saveIndexKey($key.'_i', '', $oldid, '');
            $this->saveIndexKey($key.'_i', '', $newid, $mergedline);
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
*
     * @param string    $page   a page name
     * @return string|boolean  the result of deletePageNoLock(), or 'locked'
     *                         when the index lock could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page)
    {
        if (!$this->lock()) return 'locked';

        $result = $this->deletePageNoLock($page);
        $this->unlock();
        return $result;
    }

    /**
     * Remove a page from the index without locking the index,
     * only use this function if the index is already locked
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function deletePageNoLock($page)
    {
        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            return false;
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            // group the "length*wordid" references of the page by word length
            $delwords = explode(':', $pageword_idx);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            // a count of 0 takes the page out of the word's entry
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, '')) {
            return false;
        }

        // remove the title and the page's values of every known metadata key
        $this->saveIndexKey('title', '', $pid, '');
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                if ($id === '') continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        return true;
    }

    /**
     * Clear the whole index
     *
     * Deletes the page, title, pageword, metadata and lengths index files as
     * well as every fulltext (i*.idx, w*.idx) and metadata (*_w.idx, *_i.idx,
     * *_p.idx) index file found in the index directory. Deletion errors are
     * ignored.
     *
     * @return bool If the index has been cleared successfully,
     *              false when the index lock could not be obtained
     */
    public function clear()
    {
        global $conf;

        if (!$this->lock()) return false;

        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if ($dir !== false) {
            while (($f = readdir($dir)) !== false) {
                if (in_array($f[0], ['i', 'w']) && substr($f, -4) == '.idx') {
                    // fulltext index
                    @unlink($conf['indexdir']."/$f");
                } elseif (in_array(substr($f, -6), ['_w.idx','_i.idx','_p.idx'])) {
                    // metadata index
                    @unlink($conf['indexdir']."/$f");
                }
            }
            // release the directory handle instead of leaking it
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The words are lowercased. Non-numeric words shorter than
     * IDX_MINWORDLENGTH bytes and stop words are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false)
    {
        // the asterisk is only added to the extra characters handed to
        // stripspecials() when wildcards are not allowed
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $evt = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            // plain ASCII alphanumeric text needs no special treatment
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // lowercase; the UTF-8 aware function is only used where needed
        $wordlist = explode(' ', $text);
        foreach ($wordlist as $i => $word) {
            $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                Utf8\PhpString::strtolower($word) : strtolower($word);
        }

        // drop too short words (numbers are exempt) and stop words
        foreach ($wordlist as $i => $word) {
            if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
              || array_search($word, $this->getStopwords(), true) !== false) {
                unset($wordlist[$i]);
            }
        }
        return array_values($wordlist);
    }

    /**
     * Get the numeric PID of a page
     *
     * A cached PID is returned without locking; otherwise the index is
     * locked for the duration of the lookup.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     *                  (including a lock that could not be obtained)
     */
    public function getPID($page)
    {
        // return PID without locking when it is in the cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        if (!$this->lock()) return false;

        // load known documents; $pid is false when the lookup failed
        $pid = $this->getPIDNoLock($page);
        $this->unlock();
        return $pid;
    }

    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
*
     * Successful lookups are kept in a small cache keyed by page name.
     * Failed lookups are not cached, so a later call tries again.
     *
     * @param string $page The page to get the PID for
     * @return int|bool The page id on success, false on error
     */
    protected function getPIDNoLock($page)
    {
        // avoid expensive addIndexKey operation for the most recently
        // requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
        $pid = $this->addIndexKey('page', '', $page);
        // never remember a failure: isset() is true for a cached false and
        // would pin the error for the rest of the request
        if ($pid === false) return false;

        // limit the cache size by discarding the oldest element once it
        // holds more than 10 entries, as in DokuWiki usually only the most
        // recently added item will be requested again
        if (count($this->pidCache) > 10) {
            // array_shift() must not be used here: it renumbers integer-like
            // keys, which would remap the cache entries of numerically named
            // pages (e.g. "2023") to 0, 1, ... and hand out wrong PIDs
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }

    /**
     * Get the page id of a numeric PID
     *
     * Reads the entry stored under the given PID in the 'page' index.
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid)
    {
        return $this->getIndexKey('page', '', $pid);
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
*
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens)
    {
        // $matches is filled with token => list of "length*wordid" references
        $matches = array();
        $wids = $this->getIndexWords($tokens, $matches);
        if (empty($wids)) return array();

        // resolve every referenced word to the pages it occurs on
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $known = count($index);
            foreach (array_unique($ids) as $ixid) {
                // ids beyond the end of the index have no entry (yet)
                if ($ixid >= $known) continue;
                $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // sum up the hits per token and page
        $final = array();
        foreach ($matches as $token => $refs) {
            $pages = array();
            foreach ($refs as $ref) {
                // the word had no index entry, see above
                if (!isset($docs[$ref])) continue;
                foreach ($docs[$ref] as $pagename => $count) {
                    // make sure the document still exists
                    if (!page_exists($pagename, '', false)) continue;
                    if (isset($pages[$pagename])) {
                        $pages[$pagename] += $count;
                    } else {
                        $pages[$pagename] = $count;
                    }
                }
            }
            $final[$token] = $pages;
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
*
     * Without a callback, a value may start and/or end with '*' to request
     * a suffix, prefix or substring match; the rest of the value is matched
     * literally.
     *
     * @param string    $key     name of the metadata key to look for
     * @param string    $value   search term to look for, must be a string or array of strings
     * @param callback  $func    comparison function
     * @return array lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func=null)
    {
        // always work on a list of values; an array argument is aliased, not copied
        if (!is_array($value)) {
            $value_array = array($value);
        } else {
            $value_array =& $value;
        }

        // the matching ids for the provided value(s)
        $value_ids = array();

        $metaname = $this->cleanName($key);

        // get all words in order to search the matching ids
        // (for titles the ids found below are page ids, otherwise value ids)
        if ($key == 'title') {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname.'_w', '');
        }

        if (!is_null($func)) {
            // custom comparison: every value is tested against every word
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word))) {
                        $value_ids[$i][] = $val;
                    }
                }
            }
        } else {
            foreach ($value_array as $val) {
                $xval = $val;
                // regex anchors, removed again where a wildcard is found
                $caret = '^';
                $dollar = '$';
                // check for wildcards
                if (substr($xval, 0, 1) == '*') {
                    $xval = substr($xval, 1);
                    $caret = '';
                }
                if (substr($xval, -1, 1) == '*') {
                    $xval = substr($xval, 0, -1);
                    $dollar = '';
                }
                if (!$caret || !$dollar) {
                    // at least one wildcard: match with a regular expression
                    $re = $caret.preg_quote($xval, '/').$dollar;
                    foreach (array_keys(preg_grep('/'.$re.'/', $words)) as $i) {
                        $value_ids[$i][] = $val;
                    }
                } else {
                    // no wildcard: exact match, only the first hit is used
                    if (($i = array_search($val, $words, true)) !== false) {
                        $value_ids[$i][] = $val;
                    }
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        // Special handling for titles
        if ($key == 'title') {
            // the title index is keyed by page id, so the ids map to pages directly
            foreach ($value_ids as $pid => $val_list) {
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken
            // and matched with the pages
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                // parse the tuples of the form page_id*1:page2_id*1 and so on,
                // return value is an array with page_id => 1, page2_id => 1 etc.
                // so take the keys only
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($val_list as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }
        // a single search value returns its list of pages directly
        if (!is_array($value)) $result = $result[$value];
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result)
    {
        // base word => array( [ query term , regexp or null ] ... )
        $queries = array();
        // base word length => array( base word ... )
        $byLength = array();
        // wildcard base word => base word length
        $wildcards = array();

        foreach ($words as $term) {
            $result[$term] = array();
            $base = $term;
            $length = static::wordlen($term);

            // strip a leading and/or trailing '*' and remember which side is open
            $openStart = (substr($base, 0, 1) == '*');
            if ($openStart) {
                $base = substr($base, 1);
                $length -= 1;
            }
            $openEnd = (substr($base, -1, 1) == '*');
            if ($openEnd) {
                $base = substr($base, 0, -1);
                $length -= 1;
            }
            $isWild = $openStart || $openEnd;

            // exact, non-numeric terms below the minimum token length are dropped
            if (!$isWild && $length < IDX_MINWORDLENGTH && !is_numeric($base)) {
                continue;
            }

            // register each base word once under its length
            if (!isset($queries[$base])) {
                $byLength[$length][] = $base;
            }

            if ($isWild) {
                $re = ($openStart ? '' : '^').preg_quote($base, '/').($openEnd ? '' : '$');
                $queries[$base][] = array($term, '/'.$re.'/');
                if (!isset($wildcards[$base])) {
                    $wildcards[$base] = $length;
                }
            } else {
                $queries[$base][] = array($term, null);
            }
        }
        // shortest wildcard bases first, so the scan below can stop early
        asort($wildcards);

        // exact search only needs the indexes of the requested lengths; a wildcard
        // can match any word at least as long as the shortest base word
        $filter = empty($wildcards) ? $byLength : min(array_keys($byLength));
        $lengths = $this->indexLengths($filter);
        if (!empty($wildcards)) {
            sort($lengths);
        }

        // get word IDs
        $wids = array();
        foreach ($lengths as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            if (isset($byLength[$ixlen])) {
                foreach ($byLength[$ixlen] as $base) {
                    $wid = array_search($base, $word_idx, true);
                    if ($wid === false) {
                        continue;
                    }
                    $wids[$ixlen][] = $wid;
                    foreach ($queries[$base] as $query) {
                        $result[$query[0]][] = "$ixlen*$wid";
                    }
                }
            }

            // handle wildcard search
            foreach ($wildcards as $base => $length) {
                // only words longer than the base word are considered; all
                // following base words are at least as long, so stop here
                if ($length >= $ixlen) {
                    break;
                }
                foreach ($queries[$base] as $query) {
                    if (is_null($query[1])) {
                        continue;
                    }
                    foreach (preg_grep($query[1], $word_idx) as $wid => $unused) {
                        $wids[$ixlen][] = $wid;
                        $result[$query[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
*
     * @param string $key list only pages containing the metadata key (optional)
     * @return array      list of page names
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null)
    {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) {
            return $page_idx;
        }

        // Special handling for titles: the title index has one line per page id
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title) {
                if ($title === '') {
                    unset($page_idx[$i]);
                }
            }
            return array_values($page_idx);
        }

        $metaname = $this->cleanName($key);

        // Collect the pages as array keys to de-duplicate them. The union operator
        // is used on purpose: array_merge() renumbers integer keys, and PHP turns
        // numeric page names such as "2020" into integer keys, so merging would
        // replace those names with 0, 1, 2, ...
        $pages = array();
        foreach ($this->getIndex($metaname.'_i', '') as $line) {
            $pages += $this->parseTuples($page_idx, $line);
        }

        // hand back names as strings, like the unfiltered page list above
        return array_map('strval', array_keys($pages));
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int    $min    bottom frequency threshold
     * @param int    $max    upper frequency limit. No limit if $max<$min
     * @param int    $minlen minimum length of words to count
     * @param string $key    metadata key to list.
Uses the fulltext index if not given
     * @return array         list of words as the keys and frequency as values,
     *                       highest frequency first
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null)
    {
        $min = ($min < 1) ? 1 : $min;
        // an upper limit below the lower one means "no upper limit"
        $max = ($max < $min) ? 0 : $max;

        // true when a frequency lies inside the requested window
        $wanted = function ($count) use ($min, $max) {
            return $count >= $min && (!$max || $count <= $max);
        };

        $result = array();

        if ($key == 'title') {
            // titles are stored one per page, so counting equal lines gives the frequency
            $titles = array_count_values($this->getIndex('title', ''));
            foreach ($titles as $title => $count) {
                if ($wanted($count) && strlen($title) >= $minlen) {
                    $result[$title] = $count;
                }
            }
        } elseif (!is_null($key)) {
            $metaname = $this->cleanName($key);

            // first pick the value ids by frequency ...
            $selected = array();
            foreach ($this->getIndex($metaname.'_i', '') as $wid => $line) {
                $count = $this->countTuples($line);
                if ($wanted($count)) {
                    $selected[$wid] = $count;
                }
            }

            // ... then load the values themselves only if something was picked
            if (!empty($selected)) {
                $values = $this->getIndex($metaname.'_w', '');
                foreach ($selected as $wid => $count) {
                    if (strlen($values[$wid]) >= $minlen) {
                        $result[$values[$wid]] = $count;
                    }
                }
            }
        } else {
            foreach ($this->listIndexLengths() as $length) {
                if ($length < $minlen) {
                    continue;
                }
                // the word list of this length is loaded on the first hit only
                $words = null;
                foreach ($this->getIndex('i', $length) as $wid => $line) {
                    $count = $this->countTuples($line);
                    if (!$wanted($count)) {
                        continue;
                    }
                    if ($words === null) {
                        $words = $this->getIndex('w', $length);
                    }
                    $result[$words[$wid]] = $count;
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Clean a name of a key for use as a file name.
     *
     * Romanizes non-latin characters, then strips away anything that's
     * not a letter, number, or underscore.
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $name
     * @return string lower-cased name made of a-z, 0-9 and underscores only
     */
    protected function cleanName($name)
    {
        $romanized = Utf8\Clean::romanize(trim((string)$name));
        // runs of spaces, dots, slashes, colons and dashes collapse into one underscore
        $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);
        // everything else that is not alphanumeric or an underscore is dropped
        $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $underscored);
        return strtolower($stripped);
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory in the configured lock dir. A lock directory
     * older than five minutes is considered stale and removed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool|string true when the lock was acquired, the string
     *                     "stale lock removed" when a stale lock had to be
     *                     cleared first, false when no lock could be obtained
     */
    protected function lock()
    {
        global $conf;
        $lockDir = $conf['lockdir'].'/_indexer.lock';
        $status = true;
        $attempts = 0;

        // mkdir fails while another process holds the lock directory
        while (!@mkdir($lockDir, $conf['dmode'])) {
            usleep(50);

            $stale = is_dir($lockDir) && (time() - @filemtime($lockDir) > 60*5);
            if ($stale) {
                // looks like a stale lock - remove it and try again;
                // a failed removal is reported through the false return only
                if (!@rmdir($lockDir)) {
                    return false;
                }
                $status = "stale lock removed";
                continue;
            }

            // give up once the retry counter reaches 1000; each retry sleeps
            // 50 microseconds, so this is about 50 ms of sleeping in total
            if ($attempts++ == 1000) {
                return false;
            }
        }

        if (!empty($conf['dperm'])) {
            chmod($lockDir, $conf['dperm']);
        }
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * Removes the lock directory; a failing removal is silently ignored.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool always true
     */
    protected function unlock()
    {
        global $conf;
        $lockDir = $conf['lockdir'].'/_indexer.lock';
        @rmdir($lockDir);
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into multiple parts.
     * Different index files should use different base names.
*
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without CR or LF; empty when the
     *                          index file does not exist
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getIndex($idx, $suffix)
    {
        global $conf;
        $file = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        return file_exists($file) ? file($file, FILE_IGNORE_NEW_LINES) : array();
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The lines are written to a temporary file which is then renamed over
     * the index file.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param array     $lines  list of lines without LF
     * @return bool             false when the temporary file could not be opened,
     *                          true otherwise
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines)
    {
        global $conf;
        $base = $conf['indexdir'].'/'.$idx.$suffix;
        $tmp = $base.'.tmp';

        $out = @fopen($tmp, 'w');
        if (!$out) {
            return false;
        }

        // every line is newline terminated; an empty index yields an empty file
        $payload = implode("\n", $lines);
        if (!empty($lines)) {
            $payload .= "\n";
        }
        fwrite($out, $payload);
        fclose($out);

        if (isset($conf['fperm'])) {
            chmod($tmp, $conf['fperm']);
        }
        io_rename($tmp, $base.'.idx');
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           a line with trailing whitespace removed; empty when
     *                          the file or the line does not exist
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id)
    {
        global $conf;
        $file = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($file)) {
            return '';
        }
        $in = @fopen($file, 'r');
        if (!$in) {
            return '';
        }

        // read line by line until the requested (zero based) line shows up
        $found = '';
        for ($n = 0; ($row = fgets($in)) !== false; $n++) {
            if ($n == $id) {
                $found = $row;
                break;
            }
        }
        fclose($in);
        return rtrim($found);
    }

    /**
     * Write a line into the index.
*
     * The index is copied into a temporary file with the requested line
     * replaced; when the index is shorter than $id it is padded with empty
     * lines first. The temporary file is then renamed over the index file.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write
     * @return bool             false when the temporary file could not be opened,
     *                          true otherwise
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line)
    {
        global $conf;
        $base = $conf['indexdir'].'/'.$idx.$suffix;
        $tmp = $base.'.tmp';

        // the stored line is always newline terminated
        if (substr($line, -1) !== "\n") {
            $line .= "\n";
        }

        $out = @fopen($tmp, 'w');
        if (!$out) {
            return false;
        }

        // copy the existing index, swapping in the new line at position $id
        $last = -1;
        $in = @fopen($base.'.idx', 'r');
        if ($in) {
            while (($current = fgets($in)) !== false) {
                $last++;
                fwrite($out, ($last == $id) ? $line : $current);
            }
            fclose($in);
        }

        // no index yet, or the index ends before $id: pad with empty lines and append
        if (!$in || $id > $last) {
            while ($id > ++$last) {
                fwrite($out, "\n");
            }
            fwrite($out, $line);
        }
        fclose($out);

        if (isset($conf['fperm'])) {
            chmod($tmp, $conf['fperm']);
        }
        io_rename($tmp, $base.'.idx');
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
*
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|bool         line number of the value in the index
     *                          or false if writing the index failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value)
    {
        $index = $this->getIndex($idx, $suffix);
        $id = array_search($value, $index, true);
        if ($id !== false) {
            return $id;
        }

        // unknown value: append it and persist the whole index
        $id = count($index);
        $index[$id] = $value;
        if (!$this->saveIndex($idx, $suffix, $index)) {
            trigger_error("Failed to write $idx index", E_USER_ERROR);
            return false;
        }
        return $id;
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * The cache file (lengths.idx) is only used when $conf['readdircache'] is
     * non-zero, and only while it is younger than that many seconds. Writing
     * the cache is best effort: a failure does not affect the returned list.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @return array list of word lengths as integers; empty when the index
     *               directory cannot be opened
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';
        $docache = ($conf['readdircache'] != 0);

        // serve from the cache file while it is still fresh
        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', $lengths);
                }
            }
        }

        // scan the index directory for files named i<number>.idx
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) {
            return array();
        }
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) {
                    $idx[] = (int)$i;
                }
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            // fopen() returns false when the file is not writable; passing that to
            // fwrite()/fclose() throws a TypeError on PHP 8 which @ cannot suppress
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                fclose($handle);
            }
        }
        return $idx;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
*
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @param array|int $filter an array whose keys are the lengths to test for an
     *                          existing index file, or a minimum length
     * @return array            the lengths for which an index is available
     */
    protected function indexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // keep all the known lengths equal or superior to the given minimum
            $minimum = (int)$filter;
            $matching = array_filter(
                $this->listIndexLengths(),
                function ($length) use ($minimum) {
                    return (int)$length >= $minimum;
                }
            );
            return array_values($matching);
        }

        // testing if index files exist only
        $prefix = $conf['indexdir']."/i";
        $available = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $available[] = $length;
            }
        }
        return $available;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * Any existing tuple for $id is removed from the line. When $count is
     * non-zero the new "id*count" tuple is put in front of the remaining ones.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string     $line  colon separated list of id*count tuples
     * @param string|int $id
     * @param int        $count
     * @return string    the updated line
     */
    protected function updateTuple($line, $id, $count)
    {
        // drop the old tuple of this id, if any
        if ($line != '') {
            $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
        }
        // removing the first tuple leaves a leading separator behind
        $rest = trim($line, ':');

        if (!$count) {
            return $rest;
        }
        $tuple = "$id*$count";
        return $rest ? $tuple.':'.$rest : $tuple;
    }

    /**
     * Split a line into an array of tuples.
*
     * Tuples without a count, with a zero count, or whose id has no usable
     * entry in $keys are skipped.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param array  $keys list used to translate the id of each tuple (index => name)
     * @param string $line colon separated list of id*count tuples
     * @return array       name => count
     */
    protected function parseTuples(&$keys, $line)
    {
        $result = array();
        if ($line == '') {
            return $result;
        }

        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') {
                continue;
            }
            $fields = explode('*', $tuple);
            // malformed tuple (no '*') or an empty/zero count: nothing to record
            if (empty($fields[1])) {
                continue;
            }
            // ids that are unknown to $keys (or map to null) are ignored without
            // raising an "undefined array key" warning
            if (!isset($keys[$fields[0]])) {
                continue;
            }
            $name = $keys[$fields[0]];
            if ($name === false) {
                continue;
            }
            $result[$name] = $fields[1];
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * Tuples without a count contribute nothing to the sum.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line colon separated list of id*count tuples
     * @return int
     */
    protected function countTuples($line)
    {
        $freq = 0;
        foreach (explode(':', (string)$line) as $tuple) {
            if ($tuple === '') {
                continue;
            }
            $fields = explode('*', $tuple);
            // guard against tuples that lack the '*count' part
            if (isset($fields[1])) {
                $freq += (int)$fields[1];
            }
        }
        return $freq;
    }
}