<?php
namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);


/**
 * Class DokuWiki Indexer for Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer {

    /** @var Indexer the shared instance handed out by getInstance() */
    protected static $instance = null;

    /** @var array $pidCache Cache for getPID() */
    protected $pidCache = array();

    /** @var array $Stopwords Words that indexer ignores, loaded on first use by getStopwords() */
    protected $Stopwords;

    /**
     * Indexer constructor. Singleton, thus protected!
     */
    protected function __construct() {}

    /**
     * Get new or existing singleton instance of the Indexer
     *
     * The instance is created with late static binding on the first call
     * and reused afterwards.
     *
     * @return Indexer
     */
    public static function getInstance()
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from the stopwords.txt file of the configured
     * language and kept in $this->Stopwords; a missing file yields an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        // already loaded earlier, reuse the cached list
        if (isset($this->Stopwords)) {
            return $this->Stopwords;
        }

        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        $this->Stopwords = file_exists($swfile)
            ? file($swfile, FILE_IGNORE_NEW_LINES)
            : array();

        return $this->Stopwords;
    }

    /**
     * Measure the length of a string.
     * Differs from strlen in handling of asian characters.
74 * 75 * @author Tom N Harris <tnharris@whoopdedo.org> 76 * 77 * @param string $w 78 * @return int 79 */ 80 public static function wordlen($w) 81 { 82 $l = strlen($w); 83 // If left alone, all chinese "words" will get put into w3.idx 84 // So the "length" of a "word" is faked 85 if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) { 86 foreach ($leadbytes[0] as $b) { 87 $l += ord($b) - 0xE1; 88 } 89 } 90 return $l; 91 } 92 93 /** 94 * Version of the indexer taking into consideration the external tokenizer. 95 * The indexer is only compatible with data written by the same version. 96 * 97 * @triggers INDEXER_VERSION_GET 98 * Plugins that modify what gets indexed should hook this event and 99 * add their version info to the event data like so: 100 * $data[$plugin_name] = $plugin_version; 101 * 102 * @author Tom N Harris <tnharris@whoopdedo.org> 103 * @author Michael Hamann <michael@content-space.de> 104 * 105 * @return int|string 106 */ 107 public function getVersion() 108 { 109 static $indexer_version = null; 110 if ($indexer_version == null) { 111 $version = INDEXER_VERSION; 112 113 // DokuWiki version is included for the convenience of plugins 114 $data = array('dokuwiki'=>$version); 115 Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false); 116 unset($data['dokuwiki']); // this needs to be first 117 ksort($data); 118 foreach ($data as $plugin => $vers) { 119 $version .= '+'.$plugin.'='.$vers; 120 } 121 $indexer_version = $version; 122 } 123 return $indexer_version; 124 } 125 126 /** 127 * Adds/updates the search index for the given page 128 * 129 * Locking is handled internally. 
130 * 131 * @param string $page name of the page to index 132 * @param bool $verbose print status messages 133 * @param bool $force force reindexing even when the index is up to date 134 * @return string|bool the function completed successfully 135 * 136 * @author Tom N Harris <tnharris@whoopdedo.org> 137 */ 138 public function addPage($page, $verbose=false, $force=false) 139 { 140 $idxtag = metaFN($page,'.indexed'); 141 // check if page was deleted but is still in the index 142 if (!page_exists($page)) { 143 if (!file_exists($idxtag)) { 144 if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF); 145 return false; 146 } 147 $result = $this->deletePage($page); 148 if ($result === 'locked') { 149 if ($verbose) print("Indexer: locked".DOKU_LF); 150 return false; 151 } 152 @unlink($idxtag); 153 return $result; 154 } 155 156 // check if indexing needed 157 if (!$force && file_exists($idxtag)) { 158 if (trim(io_readFile($idxtag)) == $this->getVersion()) { 159 $last = @filemtime($idxtag); 160 if ($last > @filemtime(wikiFN($page))) { 161 if ($verbose) print("Indexer: index for $page up to date".DOKU_LF); 162 return false; 163 } 164 } 165 } 166 167 $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED); 168 if ($indexenabled === false) { 169 $result = false; 170 if (file_exists($idxtag)) { 171 $result = $this->deletePage($page); 172 if ($result === 'locked') { 173 if ($verbose) print("Indexer: locked".DOKU_LF); 174 return false; 175 } 176 @unlink($idxtag); 177 } 178 if ($verbose) print("Indexer: index disabled for $page".DOKU_LF); 179 return $result; 180 } 181 182 $pid = $this->getPID($page); 183 if ($pid === false) { 184 if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF); 185 return false; 186 } 187 $body = ''; 188 $metadata = array(); 189 $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED); 190 191 $references = p_get_metadata($page, 'relation references', 
METADATA_RENDER_UNLIMITED); 192 $metadata['relation_references'] = ($references !== null) ? 193 array_keys($references) : array(); 194 195 $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED); 196 $metadata['relation_media'] = ($media !== null) ? 197 array_keys($media) : array(); 198 199 $data = compact('page', 'body', 'metadata', 'pid'); 200 $evt = new Event('INDEXER_PAGE_ADD', $data); 201 if ($evt->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page); 202 $evt->advise_after(); 203 unset($evt); 204 extract($data); 205 206 $result = $this->addPageWords($page, $body); 207 if ($result === 'locked') { 208 if ($verbose) print("Indexer: locked".DOKU_LF); 209 return false; 210 } 211 212 if ($result) { 213 $result = $this->addMetaKeys($page, $metadata); 214 if ($result === 'locked') { 215 if ($verbose) print("Indexer: locked".DOKU_LF); 216 return false; 217 } 218 } 219 220 if ($result) { 221 io_saveFile(metaFN($page,'.indexed'), $this->getVersion()); 222 } 223 if ($verbose) { 224 print("Indexer: finished".DOKU_LF); 225 return true; 226 } 227 return $result; 228 } 229 230 /** 231 * Adds the contents of a page to the fulltext index 232 * 233 * The added text replaces previous words for the same page. 234 * An empty value erases the page. 
235 * 236 * @param string $page a page name 237 * @param string $text the body of the page 238 * @return string|bool the function completed successfully 239 * 240 * @author Tom N Harris <tnharris@whoopdedo.org> 241 * @author Andreas Gohr <andi@splitbrain.org> 242 */ 243 public function addPageWords($page, $text) 244 { 245 if (!$this->lock()) return 'locked'; 246 247 // load known documents 248 $pid = $this->getPIDNoLock($page); 249 if ($pid === false) { 250 $this->unlock(); 251 return false; 252 } 253 254 $pagewords = array(); 255 // get word usage in page 256 $words = $this->getPageWords($text); 257 if ($words === false) { 258 $this->unlock(); 259 return false; 260 } 261 262 if (!empty($words)) { 263 foreach (array_keys($words) as $wlen) { 264 $index = $this->getIndex('i', $wlen); 265 foreach ($words[$wlen] as $wid => $freq) { 266 $idx = ($wid < count($index)) ? $index[$wid] : ''; 267 $index[$wid] = $this->updateTuple($idx, $pid, $freq); 268 $pagewords[] = "$wlen*$wid"; 269 } 270 if (!$this->saveIndex('i', $wlen, $index)) { 271 $this->unlock(); 272 return false; 273 } 274 } 275 } 276 277 // Remove obsolete index entries 278 $pageword_idx = $this->getIndexKey('pageword', '', $pid); 279 if ($pageword_idx !== '') { 280 $oldwords = explode(':',$pageword_idx); 281 $delwords = array_diff($oldwords, $pagewords); 282 $upwords = array(); 283 foreach ($delwords as $word) { 284 if ($word != '') { 285 list($wlen,$wid) = explode('*', $word); 286 $wid = (int)$wid; 287 $upwords[$wlen][] = $wid; 288 } 289 } 290 foreach ($upwords as $wlen => $widx) { 291 $index = $this->getIndex('i', $wlen); 292 foreach ($widx as $wid) { 293 $index[$wid] = $this->updateTuple($index[$wid], $pid, 0); 294 } 295 $this->saveIndex('i', $wlen, $index); 296 } 297 } 298 // Save the reverse index 299 $pageword_idx = join(':', $pagewords); 300 if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) { 301 $this->unlock(); 302 return false; 303 } 304 305 $this->unlock(); 306 return true; 307 } 308 309 
/** 310 * Split the words in a page and add them to the index. 311 * 312 * @param string $text content of the page 313 * @return array list of word IDs and number of times used 314 * 315 * @author Andreas Gohr <andi@splitbrain.org> 316 * @author Christopher Smith <chris@jalakai.co.uk> 317 * @author Tom N Harris <tnharris@whoopdedo.org> 318 */ 319 protected function getPageWords($text) 320 { 321 $tokens = $this->tokenizer($text); 322 $tokens = array_count_values($tokens); // count the frequency of each token 323 324 $words = array(); 325 foreach ($tokens as $w => $c) { 326 $l = static::wordlen($w); 327 if (isset($words[$l])) { 328 $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0); 329 } else { 330 $words[$l] = array($w => $c); 331 } 332 } 333 334 // arrive here with $words = array(wordlen => array(word => frequency)) 335 $word_idx_modified = false; 336 $index = array(); //resulting index 337 foreach (array_keys($words) as $wlen) { 338 $word_idx = $this->getIndex('w', $wlen); 339 foreach ($words[$wlen] as $word => $freq) { 340 $word = (string)$word; 341 $wid = array_search($word, $word_idx, true); 342 if ($wid === false) { 343 $wid = count($word_idx); 344 $word_idx[] = $word; 345 $word_idx_modified = true; 346 } 347 if (!isset($index[$wlen])) { 348 $index[$wlen] = array(); 349 } 350 $index[$wlen][$wid] = $freq; 351 } 352 // save back the word index 353 if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) { 354 return false; 355 } 356 } 357 358 return $index; 359 } 360 361 /** 362 * Add/update keys to/of the metadata index. 363 * 364 * Adding new keys does not remove other keys for the page. 365 * An empty value will erase the key. 366 * The $key parameter can be an array to add multiple keys. $value will 367 * not be used if $key is an array. 
368 * 369 * @param string $page a page name 370 * @param mixed $key a key string or array of key=>value pairs 371 * @param mixed $value the value or list of values 372 * @return bool|string the function completed successfully 373 * 374 * @author Tom N Harris <tnharris@whoopdedo.org> 375 * @author Michael Hamann <michael@content-space.de> 376 */ 377 public function addMetaKeys($page, $key, $value=null) 378 { 379 if (!is_array($key)) { 380 $key = array($key => $value); 381 } elseif (!is_null($value)) { 382 // $key is array, but $value is not null 383 trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING); 384 } 385 386 if (!$this->lock()) return 'locked'; 387 388 // load known documents 389 $pid = $this->getPIDNoLock($page); 390 if ($pid === false) { 391 $this->unlock(); 392 return false; 393 } 394 395 // Special handling for titles so the index file is simpler 396 if (array_key_exists('title', $key)) { 397 $value = $key['title']; 398 if (is_array($value)) { 399 $value = $value[0]; 400 } 401 $this->saveIndexKey('title', '', $pid, $value); 402 unset($key['title']); 403 } 404 405 foreach ($key as $name => $values) { 406 $metaname = $this->cleanName($name); 407 $this->addIndexKey('metadata', '', $metaname); 408 $metaidx = $this->getIndex($metaname.'_i', ''); 409 $metawords = $this->getIndex($metaname.'_w', ''); 410 $addwords = false; 411 412 if (!is_array($values)) $values = array($values); 413 414 $val_idx = $this->getIndexKey($metaname.'_p', '', $pid); 415 if ($val_idx !== '') { 416 $val_idx = explode(':', $val_idx); 417 // -1 means remove, 0 keep, 1 add 418 $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1)); 419 } else { 420 $val_idx = array(); 421 } 422 423 foreach ($values as $val) { 424 $val = (string)$val; 425 if ($val !== '') { 426 $id = array_search($val, $metawords, true); 427 if ($id === false) { 428 // didn't find $val, so we'll add it to the end of metawords 429 // and create a placeholder in metaidx 430 $id = 
count($metawords); 431 $metawords[$id] = $val; 432 $metaidx[$id] = ''; 433 $addwords = true; 434 } 435 // test if value is already in the index 436 if (isset($val_idx[$id]) && $val_idx[$id] <= 0) { 437 $val_idx[$id] = 0; 438 } else { // else add it 439 $val_idx[$id] = 1; 440 } 441 } 442 } 443 444 if ($addwords) { 445 $this->saveIndex($metaname.'_w', '', $metawords); 446 } 447 $vals_changed = false; 448 foreach ($val_idx as $id => $action) { 449 if ($action == -1) { 450 $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0); 451 $vals_changed = true; 452 unset($val_idx[$id]); 453 } elseif ($action == 1) { 454 $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1); 455 $vals_changed = true; 456 } 457 } 458 459 if ($vals_changed) { 460 $this->saveIndex($metaname.'_i', '', $metaidx); 461 $val_idx = implode(':', array_keys($val_idx)); 462 $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx); 463 } 464 465 unset($metaidx); 466 unset($metawords); 467 } 468 469 $this->unlock(); 470 return true; 471 } 472 473 /** 474 * Rename a page in the search index without changing the indexed content. 475 * This function doesn't check if the old or new name exists in the filesystem. 476 * It returns an error if the old page isn't in the page list of the indexer 477 * and it deletes all previously indexed content of the new page. 
478 * 479 * @param string $oldpage The old page name 480 * @param string $newpage The new page name 481 * @return bool|string If the page was successfully renamed, 482 * can be a message in the case of an error 483 */ 484 public function renamePage($oldpage, $newpage) 485 { 486 if (!$this->lock()) return 'locked'; 487 488 $pages = $this->getPages(); 489 490 $id = array_search($oldpage, $pages, true); 491 if ($id === false) { 492 $this->unlock(); 493 return 'page is not in index'; 494 } 495 496 $new_id = array_search($newpage, $pages, true); 497 if ($new_id !== false) { 498 // make sure the page is not in the index anymore 499 if ($this->deletePageNoLock($newpage) !== true) { 500 return false; 501 } 502 503 $pages[$new_id] = 'deleted:'.time().rand(0, 9999); 504 } 505 506 $pages[$id] = $newpage; 507 508 // update index 509 if (!$this->saveIndex('page', '', $pages)) { 510 $this->unlock(); 511 return false; 512 } 513 514 // reset the pid cache 515 $this->pidCache = array(); 516 517 $this->unlock(); 518 return true; 519 } 520 521 /** 522 * Renames a meta value in the index. 523 * This doesn't change the meta value in the pages, it assumes that 524 * all pages will be updated. 525 * 526 * @param string $key The metadata key of which a value shall be changed 527 * @param string $oldvalue The old value that shall be renamed 528 * @param string $newvalue The new value to which the old value shall be renamed, 529 * if exists values will be merged 530 * @return bool|string If renaming the value has been successful, false 531 * or error message on error. 
532 */ 533 public function renameMetaValue($key, $oldvalue, $newvalue) 534 { 535 if (!$this->lock()) return 'locked'; 536 537 // change the relation references index 538 $metavalues = $this->getIndex($key, '_w'); 539 $oldid = array_search($oldvalue, $metavalues, true); 540 if ($oldid !== false) { 541 $newid = array_search($newvalue, $metavalues, true); 542 if ($newid !== false) { 543 // free memory 544 unset ($metavalues); 545 546 // okay, now we have two entries for the same value. we need to merge them. 547 $indexline = $this->getIndexKey($key.'_i', '', $oldid); 548 if ($indexline != '') { 549 $newindexline = $this->getIndexKey($key.'_i', '', $newid); 550 $pagekeys = $this->getIndex($key.'_p', ''); 551 $parts = explode(':', $indexline); 552 foreach ($parts as $part) { 553 list($id, $count) = explode('*', $part); 554 if ($id === '') continue; 555 $newindexline = $this->updateTuple($newindexline, $id, $count); 556 557 $keyline = explode(':', $pagekeys[$id]); 558 // remove old meta value 559 $keyline = array_diff($keyline, array($oldid)); 560 // add new meta value when not already present 561 if (!in_array($newid, $keyline)) { 562 array_push($keyline, $newid); 563 } 564 $pagekeys[$id] = implode(':', $keyline); 565 } 566 $this->saveIndex($key.'_p', '', $pagekeys); 567 unset($pagekeys); 568 $this->saveIndexKey($key.'_i', '', $oldid, ''); 569 $this->saveIndexKey($key.'_i', '', $newid, $newindexline); 570 } 571 } else { 572 $metavalues[$oldid] = $newvalue; 573 if (!$this->saveIndex($key.'_w', '', $metavalues)) { 574 $this->unlock(); 575 return false; 576 } 577 } 578 } 579 580 $this->unlock(); 581 return true; 582 } 583 584 /** 585 * Remove a page from the index 586 * 587 * Erases entries in all known indexes. 
588 * 589 * @param string $page a page name 590 * @return string|bool the function completed successfully 591 * 592 * @author Tom N Harris <tnharris@whoopdedo.org> 593 */ 594 public function deletePage($page) 595 { 596 if (!$this->lock()) return 'locked'; 597 598 $result = $this->deletePageNoLock($page); 599 $this->unlock(); 600 return $result; 601 } 602 603 /** 604 * Remove a page from the index without locking the index, 605 * only use this function if the index is already locked 606 * 607 * Erases entries in all known indexes. 608 * 609 * @param string $page a page name 610 * @return bool the function completed successfully 611 * 612 * @author Tom N Harris <tnharris@whoopdedo.org> 613 */ 614 protected function deletePageNoLock($page) 615 { 616 // load known documents 617 $pid = $this->getPIDNoLock($page); 618 if ($pid === false) { 619 return false; 620 } 621 622 // Remove obsolete index entries 623 $pageword_idx = $this->getIndexKey('pageword', '', $pid); 624 if ($pageword_idx !== '') { 625 $delwords = explode(':', $pageword_idx); 626 $upwords = array(); 627 foreach ($delwords as $word) { 628 if ($word != '') { 629 list($wlen,$wid) = explode('*', $word); 630 $wid = (int)$wid; 631 $upwords[$wlen][] = $wid; 632 } 633 } 634 foreach ($upwords as $wlen => $widx) { 635 $index = $this->getIndex('i', $wlen); 636 foreach ($widx as $wid) { 637 $index[$wid] = $this->updateTuple($index[$wid], $pid, 0); 638 } 639 $this->saveIndex('i', $wlen, $index); 640 } 641 } 642 // Save the reverse index 643 if (!$this->saveIndexKey('pageword', '', $pid, '')) { 644 return false; 645 } 646 647 $this->saveIndexKey('title', '', $pid, ''); 648 $keyidx = $this->getIndex('metadata', ''); 649 foreach ($keyidx as $metaname) { 650 $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid)); 651 $meta_idx = $this->getIndex($metaname.'_i', ''); 652 foreach ($val_idx as $id) { 653 if ($id === '') continue; 654 $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0); 655 } 656 
$this->saveIndex($metaname.'_i', '', $meta_idx); 657 $this->saveIndexKey($metaname.'_p', '', $pid, ''); 658 } 659 660 return true; 661 } 662 663 /** 664 * Clear the whole index 665 * 666 * @return bool If the index has been cleared successfully 667 */ 668 public function clear() 669 { 670 global $conf; 671 672 if (!$this->lock()) return false; 673 674 @unlink($conf['indexdir'].'/page.idx'); 675 @unlink($conf['indexdir'].'/title.idx'); 676 @unlink($conf['indexdir'].'/pageword.idx'); 677 @unlink($conf['indexdir'].'/metadata.idx'); 678 $dir = @opendir($conf['indexdir']); 679 if ($dir !== false) { 680 while (($f = readdir($dir)) !== false) { 681 if (in_array($f[0], ['i', 'w']) && substr($f, -4) == '.idx') { 682 // fulltext index 683 @unlink($conf['indexdir']."/$f"); 684 } elseif (in_array(substr($f, -6), ['_w.idx','_i.idx','_p.idx'])) { 685 // metadata index 686 @unlink($conf['indexdir']."/$f"); 687 } 688 } 689 } 690 @unlink($conf['indexdir'].'/lengths.idx'); 691 692 // clear the pid cache 693 $this->pidCache = array(); 694 695 $this->unlock(); 696 return true; 697 } 698 699 /** 700 * Split the text into words for fulltext search 701 * 702 * @triggers INDEXER_TEXT_PREPARE 703 * This event allows plugins to modify the text before it gets tokenized. 704 * Plugins intercepting this event should also intercept INDEX_VERSION_GET 705 * 706 * @param string $text plain text 707 * @param bool $wc are wildcards allowed? 708 * @return array list of words in the text 709 * 710 * @author Tom N Harris <tnharris@whoopdedo.org> 711 * @author Andreas Gohr <andi@splitbrain.org> 712 */ 713 public function tokenizer($text, $wc=false) 714 { 715 $wc = ($wc) ? 
'' : '\*'; 716 717 // prepare the text to be tokenized 718 $evt = new Event('INDEXER_TEXT_PREPARE', $text); 719 if ($evt->advise_before(true)) { 720 if (preg_match('/[^0-9A-Za-z ]/u', $text)) { 721 $text = Utf8\Asian::separateAsianWords($text); 722 } 723 } 724 $evt->advise_after(); 725 unset($evt); 726 727 $text = strtr($text, 728 array( 729 "\r" => ' ', 730 "\n" => ' ', 731 "\t" => ' ', 732 "\xC2\xAD" => '', //soft-hyphen 733 ) 734 ); 735 if (preg_match('/[^0-9A-Za-z ]/u', $text)) { 736 $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc); 737 } 738 739 $wordlist = explode(' ', $text); 740 foreach ($wordlist as $i => $word) { 741 $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ? 742 Utf8\PhpString::strtolower($word) : strtolower($word); 743 } 744 745 foreach ($wordlist as $i => $word) { 746 if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) 747 || array_search($word, $this->getStopwords(), true) !== false) { 748 unset($wordlist[$i]); 749 } 750 } 751 return array_values($wordlist); 752 } 753 754 /** 755 * Get the numeric PID of a page 756 * 757 * @param string $page The page to get the PID for 758 * @return int|bool The page id on success, false on error 759 */ 760 public function getPID($page) 761 { 762 // return PID without locking when it is in the cache 763 if (isset($this->pidCache[$page])) return $this->pidCache[$page]; 764 765 if (!$this->lock()) return false; 766 767 // load known documents 768 $pid = $this->getPIDNoLock($page); 769 if ($pid === false) { 770 $this->unlock(); 771 return false; 772 } 773 774 $this->unlock(); 775 return $pid; 776 } 777 778 /** 779 * Get the numeric PID of a page without locking the index. 780 * Only use this function when the index is already locked. 
781 * 782 * @param string $page The page to get the PID for 783 * @return int|bool The page id on success, false on error 784 */ 785 protected function getPIDNoLock($page) 786 { 787 // avoid expensive addIndexKey operation for the most recently 788 // requested pages by using a cache 789 if (isset($this->pidCache[$page])) return $this->pidCache[$page]; 790 $pid = $this->addIndexKey('page', '', $page); 791 // limit cache to 10 entries by discarding the oldest element 792 // as in DokuWiki usually only the most recently 793 // added item will be requested again 794 if (count($this->pidCache) > 10) array_shift($this->pidCache); 795 $this->pidCache[$page] = $pid; 796 return $pid; 797 } 798 799 /** 800 * Get the page id of a numeric PID 801 * 802 * @param int $pid The PID to get the page id for 803 * @return string The page id 804 */ 805 public function getPageFromPID($pid) 806 { 807 return $this->getIndexKey('page', '', $pid); 808 } 809 810 /** 811 * Find pages in the fulltext index containing the words, 812 * 813 * The search words must be pre-tokenized, meaning only letters and 814 * numbers with an optional wildcard 815 * 816 * The returned array will have the original tokens as key. The values 817 * in the returned list is an array with the page names as keys and the 818 * number of times that token appears on the page as value. 
819 * 820 * @param array $tokens list of words to search for 821 * @return array list of page names with usage counts 822 * 823 * @author Tom N Harris <tnharris@whoopdedo.org> 824 * @author Andreas Gohr <andi@splitbrain.org> 825 */ 826 public function lookup(&$tokens) 827 { 828 $result = array(); 829 $wids = $this->getIndexWords($tokens, $result); 830 if (empty($wids)) return array(); 831 // load known words and documents 832 $page_idx = $this->getIndex('page', ''); 833 $docs = array(); 834 foreach (array_keys($wids) as $wlen) { 835 $wids[$wlen] = array_unique($wids[$wlen]); 836 $index = $this->getIndex('i', $wlen); 837 foreach ($wids[$wlen] as $ixid) { 838 if ($ixid < count($index)) { 839 $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]); 840 } 841 } 842 } 843 // merge found pages into final result array 844 $final = array(); 845 foreach ($result as $word => $res) { 846 $final[$word] = array(); 847 foreach ($res as $wid) { 848 // handle the case when ($ixid < count($index)) has been false 849 // and thus $docs[$wid] hasn't been set. 850 if (!isset($docs[$wid])) continue; 851 $hits = &$docs[$wid]; 852 foreach ($hits as $hitkey => $hitcnt) { 853 // make sure the document still exists 854 if (!page_exists($hitkey, '', false)) continue; 855 if (!isset($final[$word][$hitkey])) { 856 $final[$word][$hitkey] = $hitcnt; 857 } else { 858 $final[$word][$hitkey] += $hitcnt; 859 } 860 } 861 } 862 } 863 return $final; 864 } 865 866 /** 867 * Find pages containing a metadata key. 868 * 869 * The metadata values are compared as case-sensitive strings. Pass a 870 * callback function that returns true or false to use a different 871 * comparison function. The function will be called with the $value being 872 * searched for as the first argument, and the word in the index as the 873 * second argument. The function preg_match can be used directly if the 874 * values are regexes. 
875 * 876 * @param string $key name of the metadata key to look for 877 * @param string $value search term to look for, must be a string or array of strings 878 * @param callback $func comparison function 879 * @return array lists with page names, keys are query values if $value is array 880 * 881 * @author Tom N Harris <tnharris@whoopdedo.org> 882 * @author Michael Hamann <michael@content-space.de> 883 */ 884 public function lookupKey($key, &$value, $func=null) 885 { 886 if (!is_array($value)) { 887 $value_array = array($value); 888 } else { 889 $value_array =& $value; 890 } 891 892 // the matching ids for the provided value(s) 893 $value_ids = array(); 894 895 $metaname = $this->cleanName($key); 896 897 // get all words in order to search the matching ids 898 if ($key == 'title') { 899 $words = $this->getIndex('title', ''); 900 } else { 901 $words = $this->getIndex($metaname.'_w', ''); 902 } 903 904 if (!is_null($func)) { 905 foreach ($value_array as $val) { 906 foreach ($words as $i => $word) { 907 if (call_user_func_array($func, array($val, $word))) { 908 $value_ids[$i][] = $val; 909 } 910 } 911 } 912 } else { 913 foreach ($value_array as $val) { 914 $xval = $val; 915 $caret = '^'; 916 $dollar = '$'; 917 // check for wildcards 918 if (substr($xval, 0, 1) == '*') { 919 $xval = substr($xval, 1); 920 $caret = ''; 921 } 922 if (substr($xval, -1, 1) == '*') { 923 $xval = substr($xval, 0, -1); 924 $dollar = ''; 925 } 926 if (!$caret || !$dollar) { 927 $re = $caret.preg_quote($xval, '/').$dollar; 928 foreach (array_keys(preg_grep('/'.$re.'/', $words)) as $i) { 929 $value_ids[$i][] = $val; 930 } 931 } else { 932 if (($i = array_search($val, $words, true)) !== false) { 933 $value_ids[$i][] = $val; 934 } 935 } 936 } 937 } 938 939 unset($words); // free the used memory 940 941 // initialize the result so it won't be null 942 $result = array(); 943 foreach ($value_array as $val) { 944 $result[$val] = array(); 945 } 946 947 $page_idx = $this->getIndex('page', ''); 948 949 
// Special handling for titles 950 if ($key == 'title') { 951 foreach ($value_ids as $pid => $val_list) { 952 $page = $page_idx[$pid]; 953 foreach ($val_list as $val) { 954 $result[$val][] = $page; 955 } 956 } 957 } else { 958 // load all lines and pages so the used lines can be taken 959 // and matched with the pages 960 $lines = $this->getIndex($metaname.'_i', ''); 961 962 foreach ($value_ids as $value_id => $val_list) { 963 // parse the tuples of the form page_id*1:page2_id*1 and so on, 964 // return value is an array with page_id => 1, page2_id => 1 etc. 965 // so take the keys only 966 $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id])); 967 foreach ($val_list as $val) { 968 $result[$val] = array_merge($result[$val], $pages); 969 } 970 } 971 } 972 if (!is_array($value)) $result = $result[$value]; 973 return $result; 974 } 975 976 /** 977 * Find the index ID of each search term. 978 * 979 * The query terms should only contain valid characters, with a '*' at 980 * either the beginning or end of the word (or both). 981 * The $result parameter can be used to merge the index locations with 982 * the appropriate query term. 983 * 984 * @param array $words The query terms. 985 * @param array $result Set to word => array("length*id" ...) 986 * @return array Set to length => array(id ...) 
987 * 988 * @author Tom N Harris <tnharris@whoopdedo.org> 989 */ 990 protected function getIndexWords(&$words, &$result) 991 { 992 $tokens = array(); 993 $tokenlength = array(); 994 $tokenwild = array(); 995 foreach ($words as $word) { 996 $result[$word] = array(); 997 $caret = '^'; 998 $dollar = '$'; 999 $xword = $word; 1000 $wlen = static::wordlen($word); 1001 1002 // check for wildcards 1003 if (substr($xword, 0, 1) == '*') { 1004 $xword = substr($xword, 1); 1005 $caret = ''; 1006 $wlen -= 1; 1007 } 1008 if (substr($xword, -1, 1) == '*') { 1009 $xword = substr($xword, 0, -1); 1010 $dollar = ''; 1011 $wlen -= 1; 1012 } 1013 if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword)) { 1014 continue; 1015 } 1016 if (!isset($tokens[$xword])) { 1017 $tokenlength[$wlen][] = $xword; 1018 } 1019 if (!$caret || !$dollar) { 1020 $re = $caret.preg_quote($xword, '/').$dollar; 1021 $tokens[$xword][] = array($word, '/'.$re.'/'); 1022 if (!isset($tokenwild[$xword])) { 1023 $tokenwild[$xword] = $wlen; 1024 } 1025 } else { 1026 $tokens[$xword][] = array($word, null); 1027 } 1028 } 1029 asort($tokenwild); 1030 // $tokens = array( base word => array( [ query term , regexp ] ... ) ... ) 1031 // $tokenlength = array( base word length => base word ... ) 1032 // $tokenwild = array( base word => base word length ... ) 1033 $length_filter = empty($tokenwild) ? 
$tokenlength : min(array_keys($tokenlength)); 1034 $indexes_known = $this->indexLengths($length_filter); 1035 if (!empty($tokenwild)) sort($indexes_known); 1036 // get word IDs 1037 $wids = array(); 1038 foreach ($indexes_known as $ixlen) { 1039 $word_idx = $this->getIndex('w', $ixlen); 1040 // handle exact search 1041 if (isset($tokenlength[$ixlen])) { 1042 foreach ($tokenlength[$ixlen] as $xword) { 1043 $wid = array_search($xword, $word_idx, true); 1044 if ($wid !== false) { 1045 $wids[$ixlen][] = $wid; 1046 foreach ($tokens[$xword] as $w) 1047 $result[$w[0]][] = "$ixlen*$wid"; 1048 } 1049 } 1050 } 1051 // handle wildcard search 1052 foreach ($tokenwild as $xword => $wlen) { 1053 if ($wlen >= $ixlen) break; 1054 foreach ($tokens[$xword] as $w) { 1055 if (is_null($w[1])) continue; 1056 foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) { 1057 $wids[$ixlen][] = $wid; 1058 $result[$w[0]][] = "$ixlen*$wid"; 1059 } 1060 } 1061 } 1062 } 1063 return $wids; 1064 } 1065 1066 /** 1067 * Return a list of all pages 1068 * Warning: pages may not exist! 
1069 * 1070 * @param string $key list only pages containing the metadata key (optional) 1071 * @return array list of page names 1072 * 1073 * @author Tom N Harris <tnharris@whoopdedo.org> 1074 */ 1075 public function getPages($key=null) 1076 { 1077 $page_idx = $this->getIndex('page', ''); 1078 if (is_null($key)) return $page_idx; 1079 1080 $metaname = $this->cleanName($key); 1081 1082 // Special handling for titles 1083 if ($key == 'title') { 1084 $title_idx = $this->getIndex('title', ''); 1085 array_splice($page_idx, count($title_idx)); 1086 foreach ($title_idx as $i => $title) { 1087 if ($title === '') unset($page_idx[$i]); 1088 } 1089 return array_values($page_idx); 1090 } 1091 1092 $pages = array(); 1093 $lines = $this->getIndex($metaname.'_i', ''); 1094 foreach ($lines as $line) { 1095 $pages = array_merge($pages, $this->parseTuples($page_idx, $line)); 1096 } 1097 return array_keys($pages); 1098 } 1099 1100 /** 1101 * Return a list of words sorted by number of times used 1102 * 1103 * @param int $min bottom frequency threshold 1104 * @param int $max upper frequency limit. No limit if $max<$min 1105 * @param int $minlen minimum length of words to count 1106 * @param string $key metadata key to list. 
Uses the fulltext index if not given 1107 * @return array list of words as the keys and frequency as values 1108 * 1109 * @author Tom N Harris <tnharris@whoopdedo.org> 1110 */ 1111 public function histogram($min=1, $max=0, $minlen=3, $key=null) 1112 { 1113 if ($min < 1) $min = 1; 1114 if ($max < $min) $max = 0; 1115 1116 $result = array(); 1117 1118 if ($key == 'title') { 1119 $index = $this->getIndex('title', ''); 1120 $index = array_count_values($index); 1121 foreach ($index as $val => $cnt) { 1122 if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) { 1123 $result[$val] = $cnt; 1124 } 1125 } 1126 } elseif (!is_null($key)) { 1127 $metaname = $this->cleanName($key); 1128 $index = $this->getIndex($metaname.'_i', ''); 1129 $val_idx = array(); 1130 foreach ($index as $wid => $line) { 1131 $freq = $this->countTuples($line); 1132 if ($freq >= $min && (!$max || $freq <= $max)) { 1133 $val_idx[$wid] = $freq; 1134 } 1135 } 1136 if (!empty($val_idx)) { 1137 $words = $this->getIndex($metaname.'_w', ''); 1138 foreach ($val_idx as $wid => $freq) { 1139 if (strlen($words[$wid]) >= $minlen) { 1140 $result[$words[$wid]] = $freq; 1141 } 1142 } 1143 } 1144 } else { 1145 $lengths = $this->listIndexLengths(); 1146 foreach ($lengths as $length) { 1147 if ($length < $minlen) continue; 1148 $index = $this->getIndex('i', $length); 1149 $words = null; 1150 foreach ($index as $wid => $line) { 1151 $freq = $this->countTuples($line); 1152 if ($freq >= $min && (!$max || $freq <= $max)) { 1153 if ($words === null) { 1154 $words = $this->getIndex('w', $length); 1155 } 1156 $result[$words[$wid]] = $freq; 1157 } 1158 } 1159 } 1160 } 1161 1162 arsort($result); 1163 return $result; 1164 } 1165 1166 /** 1167 * Clean a name of a key for use as a file name. 1168 * 1169 * Romanizes non-latin characters, then strips away anything that's 1170 * not a letter, number, or underscore. 
1171 * 1172 * @author Tom N Harris <tnharris@whoopdedo.org> 1173 * 1174 * @param string $name 1175 * @return string 1176 */ 1177 protected function cleanName($name) 1178 { 1179 $name = Utf8\Clean::romanize(trim((string)$name)); 1180 $name = preg_replace('#[ \./\\:-]+#', '_', $name); 1181 $name = preg_replace('/[^A-Za-z0-9_]/', '', $name); 1182 return strtolower($name); 1183 } 1184 1185 /** 1186 * Lock the indexer. 1187 * 1188 * @author Tom N Harris <tnharris@whoopdedo.org> 1189 * 1190 * @return bool|string 1191 */ 1192 protected function lock() 1193 { 1194 global $conf; 1195 $status = true; 1196 $run = 0; 1197 $lock = $conf['lockdir'].'/_indexer.lock'; 1198 while (!@mkdir($lock, $conf['dmode'])) { 1199 usleep(50); 1200 if (is_dir($lock) && time() - @filemtime($lock) > 60*5) { 1201 // looks like a stale lock - remove it 1202 if (!@rmdir($lock)) { 1203 $status = "removing the stale lock failed"; 1204 return false; 1205 } else { 1206 $status = "stale lock removed"; 1207 } 1208 } elseif ($run++ == 1000) { 1209 // we waited 5 seconds for that lock 1210 return false; 1211 } 1212 } 1213 if (!empty($conf['dperm'])) { 1214 chmod($lock, $conf['dperm']); 1215 } 1216 return $status; 1217 } 1218 1219 /** 1220 * Release the indexer lock. 1221 * 1222 * @author Tom N Harris <tnharris@whoopdedo.org> 1223 * 1224 * @return bool 1225 */ 1226 protected function unlock() 1227 { 1228 global $conf; 1229 @rmdir($conf['lockdir'].'/_indexer.lock'); 1230 return true; 1231 } 1232 1233 /** 1234 * Retrieve the entire index. 1235 * 1236 * The $suffix argument is for an index that is split into multiple parts. 1237 * Different index files should use different base names. 
1238 * 1239 * @param string $idx name of the index 1240 * @param string $suffix subpart identifier 1241 * @return array list of lines without CR or LF 1242 * 1243 * @author Tom N Harris <tnharris@whoopdedo.org> 1244 */ 1245 public function getIndex($idx, $suffix) 1246 { 1247 global $conf; 1248 $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; 1249 if (!file_exists($fn)) return array(); 1250 return file($fn, FILE_IGNORE_NEW_LINES); 1251 } 1252 1253 /** 1254 * Replace the contents of the index with an array. 1255 * 1256 * @param string $idx name of the index 1257 * @param string $suffix subpart identifier 1258 * @param array $lines list of lines without LF 1259 * @return bool If saving succeeded 1260 * 1261 * @author Tom N Harris <tnharris@whoopdedo.org> 1262 */ 1263 protected function saveIndex($idx, $suffix, &$lines) 1264 { 1265 global $conf; 1266 $fn = $conf['indexdir'].'/'.$idx.$suffix; 1267 $fh = @fopen($fn.'.tmp', 'w'); 1268 if (!$fh) return false; 1269 fwrite($fh, join("\n", $lines)); 1270 if (!empty($lines)) { 1271 fwrite($fh, "\n"); 1272 } 1273 fclose($fh); 1274 if (isset($conf['fperm'])) { 1275 chmod($fn.'.tmp', $conf['fperm']); 1276 } 1277 io_rename($fn.'.tmp', $fn.'.idx'); 1278 return true; 1279 } 1280 1281 /** 1282 * Retrieve a line from the index. 1283 * 1284 * @param string $idx name of the index 1285 * @param string $suffix subpart identifier 1286 * @param int $id the line number 1287 * @return string a line with trailing whitespace removed 1288 * 1289 * @author Tom N Harris <tnharris@whoopdedo.org> 1290 */ 1291 protected function getIndexKey($idx, $suffix, $id) 1292 { 1293 global $conf; 1294 $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx'; 1295 if (!file_exists($fn)) return ''; 1296 $fh = @fopen($fn, 'r'); 1297 if (!$fh) return ''; 1298 $ln = -1; 1299 while (($line = fgets($fh)) !== false) { 1300 if (++$ln == $id) break; 1301 } 1302 fclose($fh); 1303 return rtrim((string)$line); 1304 } 1305 1306 /** 1307 * Write a line into the index. 
1308 * 1309 * @param string $idx name of the index 1310 * @param string $suffix subpart identifier 1311 * @param int $id the line number 1312 * @param string $line line to write 1313 * @return bool If saving succeeded 1314 * 1315 * @author Tom N Harris <tnharris@whoopdedo.org> 1316 */ 1317 protected function saveIndexKey($idx, $suffix, $id, $line) 1318 { 1319 global $conf; 1320 if (substr($line, -1) !== "\n") { 1321 $line .= "\n"; 1322 } 1323 $fn = $conf['indexdir'].'/'.$idx.$suffix; 1324 $fh = @fopen($fn.'.tmp', 'w'); 1325 if (!$fh) return false; 1326 $ih = @fopen($fn.'.idx', 'r'); 1327 if ($ih) { 1328 $ln = -1; 1329 while (($curline = fgets($ih)) !== false) { 1330 fwrite($fh, (++$ln == $id) ? $line : $curline); 1331 } 1332 if ($id > $ln) { 1333 while ($id > ++$ln) { 1334 fwrite($fh, "\n"); 1335 } 1336 fwrite($fh, $line); 1337 } 1338 fclose($ih); 1339 } else { 1340 $ln = -1; 1341 while ($id > ++$ln) { 1342 fwrite($fh, "\n"); 1343 } 1344 fwrite($fh, $line); 1345 } 1346 fclose($fh); 1347 if (isset($conf['fperm'])) { 1348 chmod($fn.'.tmp', $conf['fperm']); 1349 } 1350 io_rename($fn.'.tmp', $fn.'.idx'); 1351 return true; 1352 } 1353 1354 /** 1355 * Retrieve or insert a value in the index. 
1356 * 1357 * @param string $idx name of the index 1358 * @param string $suffix subpart identifier 1359 * @param string $value line to find in the index 1360 * @return int|bool line number of the value in the index 1361 * or false if writing the index failed 1362 * 1363 * @author Tom N Harris <tnharris@whoopdedo.org> 1364 */ 1365 protected function addIndexKey($idx, $suffix, $value) 1366 { 1367 $index = $this->getIndex($idx, $suffix); 1368 $id = array_search($value, $index, true); 1369 if ($id === false) { 1370 $id = count($index); 1371 $index[$id] = $value; 1372 if (!$this->saveIndex($idx, $suffix, $index)) { 1373 trigger_error("Failed to write $idx index", E_USER_ERROR); 1374 return false; 1375 } 1376 } 1377 return $id; 1378 } 1379 1380 /** 1381 * Get the list of lengths indexed in the wiki. 1382 * 1383 * Read the index directory or a cache file and returns 1384 * a sorted array of lengths of the words used in the wiki. 1385 * 1386 * @author YoBoY <yoboy.leguesh@gmail.com> 1387 * 1388 * @return array 1389 */ 1390 public function listIndexLengths() 1391 { 1392 global $conf; 1393 $lengthsFile = $conf['indexdir'].'/lengths.idx'; 1394 1395 // testing what we have to do, create a cache file or not. 
1396 if ($conf['readdircache'] == 0) { 1397 $docache = false; 1398 } else { 1399 clearstatcache(); 1400 if (file_exists($lengthsFile) 1401 && (time() < @filemtime($lengthsFile) + $conf['readdircache']) 1402 ) { 1403 if ( 1404 ($lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) 1405 !== false 1406 ) { 1407 $idx = array(); 1408 foreach ($lengths as $length) { 1409 $idx[] = (int)$length; 1410 } 1411 return $idx; 1412 } 1413 } 1414 $docache = true; 1415 } 1416 1417 if ($conf['readdircache'] == 0 || $docache) { 1418 $dir = @opendir($conf['indexdir']); 1419 if ($dir === false) return array(); 1420 $idx = array(); 1421 while (($f = readdir($dir)) !== false) { 1422 if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') { 1423 $i = substr($f, 1, -4); 1424 if (is_numeric($i)) $idx[] = (int)$i; 1425 } 1426 } 1427 closedir($dir); 1428 sort($idx); 1429 // save this in a file 1430 if ($docache) { 1431 $handle = @fopen($lengthsFile, 'w'); 1432 @fwrite($handle, implode("\n", $idx)); 1433 @fclose($handle); 1434 } 1435 return $idx; 1436 } 1437 return array(); 1438 } 1439 1440 /** 1441 * Get the word lengths that have been indexed. 1442 * 1443 * Reads the index directory and returns an array of lengths 1444 * that there are indices for. 
1445 * 1446 * @author YoBoY <yoboy.leguesh@gmail.com> 1447 * 1448 * @param array|int $filter 1449 * @return array 1450 */ 1451 protected function indexLengths($filter) 1452 { 1453 global $conf; 1454 $idx = array(); 1455 if (is_array($filter)) { 1456 // testing if index files exist only 1457 $path = $conf['indexdir']."/i"; 1458 foreach ($filter as $key => $value) { 1459 if (file_exists($path.$key.'.idx')) { 1460 $idx[] = $key; 1461 } 1462 } 1463 } else { 1464 $lengths = $this->listIndexLengths(); 1465 foreach ($lengths as $key => $length) { 1466 // keep all the values equal or superior 1467 if ((int)$length >= (int)$filter) { 1468 $idx[] = $length; 1469 } 1470 } 1471 } 1472 return $idx; 1473 } 1474 1475 /** 1476 * Insert or replace a tuple in a line. 1477 * 1478 * @author Tom N Harris <tnharris@whoopdedo.org> 1479 * 1480 * @param string $line 1481 * @param int $id 1482 * @param int $count 1483 * @return string 1484 */ 1485 protected function updateTuple($line, $id, $count) 1486 { 1487 if ($line != '') { 1488 $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line); 1489 } 1490 $line = trim($line, ':'); 1491 if ($count) { 1492 if ($line) { 1493 return "$id*$count:".$line; 1494 } else { 1495 return "$id*$count"; 1496 } 1497 } 1498 return $line; 1499 } 1500 1501 /** 1502 * Split a line into an array of tuples. 
1503 * 1504 * @author Tom N Harris <tnharris@whoopdedo.org> 1505 * @author Andreas Gohr <andi@splitbrain.org> 1506 * 1507 * @param array $keys 1508 * @param string $line 1509 * @return array 1510 */ 1511 protected function parseTuples(&$keys, $line) 1512 { 1513 $result = array(); 1514 if ($line == '') return $result; 1515 $parts = explode(':', $line); 1516 foreach ($parts as $tuple) { 1517 if ($tuple === '') continue; 1518 list($key, $cnt) = explode('*', $tuple); 1519 if (!$cnt) continue; 1520 $key = $keys[$key]; 1521 if ($key === false || is_null($key)) continue; 1522 $result[$key] = $cnt; 1523 } 1524 return $result; 1525 } 1526 1527 /** 1528 * Sum the counts in a list of tuples. 1529 * 1530 * @author Tom N Harris <tnharris@whoopdedo.org> 1531 * 1532 * @param string $line 1533 * @return int 1534 */ 1535 protected function countTuples($line) 1536 { 1537 $freq = 0; 1538 $parts = explode(':', $line); 1539 foreach ($parts as $tuple) { 1540 if ($tuple === '') continue; 1541 list(/* $pid */, $cnt) = explode('*', $tuple); 1542 $freq += (int)$cnt; 1543 } 1544 return $freq; 1545 } 1546} 1547