<?php
namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;


/**
 * Class DokuWiki Indexer for Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer {

    /** @var Indexer */
    protected static $instance = null;

    /** @var array $pidCache Cache for getPID() */
    protected $pidCache = array();

    /** @var array $Stopwords Words that indexer ignores */
    protected $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected $MinWordLength;

    /**
     * Indexer constructor. Singleton, thus protected!
     */
    protected function __construct() {
        // set the minimum token length to use in the index
        // (note, this doesn't apply to numeric tokens)
        // the IDX_MINWORDLENGTH constant, when defined, overrides the default
        $this->MinWordLength = (defined('IDX_MINWORDLENGTH'))
            ? IDX_MINWORDLENGTH
            : MINWORDLENGTH;
    }

    /**
     * Get new or existing singleton instance of the Indexer
     *
     * @return Indexer
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from the stopwords.txt file of the configured
     * language and then kept in $this->Stopwords. A missing or unreadable
     * file results in an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
            $stopwords = file_exists($swfile)
                ? file($swfile, FILE_IGNORE_NEW_LINES)
                : array();
            // file() returns false on failure; never store a non-array,
            // the tokenizer searches this list for every word
            $this->Stopwords = is_array($stopwords) ? $stopwords : array();
        }
        return $this->Stopwords;
    }

    /**
     * Measure the length of a string.
* Differs from strlen in handling of asian characters: on top of the byte
     * count, every byte in the range 0xE2-0xEF adds (byte value - 0xE1).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $w
     * @return int
     */
    public static function wordlen($w)
    {
        $length = strlen($w);
        // If left alone, all chinese "words" will get put into w3.idx
        // So the "length" of a "word" is faked
        $found = preg_match_all('/[\xE2-\xEF]/', $w, $matches);
        if ($found) {
            $length += array_sum(array_map('ord', $matches[0])) - 0xE1 * $found;
        }
        return $length;
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and memoized in a static
     * variable.
     *
     * @triggers INDEXER_VERSION_GET
     *     Plugins that modify what gets indexed should hook this event and
     *     add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version != null) {
            return $indexer_version;
        }

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki' => INDEXER_VERSION);
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

        // the core version always leads the tag, so it is dropped from the
        // sorted plugin list
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);

        $tag = INDEXER_VERSION;
        foreach ($data as $plugin => $vers) {
            $tag .= '+'.$plugin.'='.$vers;
        }

        $indexer_version = $tag;
        return $indexer_version;
    }

    /**
     * Adds/updates the search index for the given page
     *
     * Locking is handled internally.
*
     * The page is removed from the index when it no longer exists or when
     * its 'internal index' metadata is false. Otherwise the page body and
     * the title / relation metadata are (re)indexed and the '.indexed' tag
     * file is written with the current indexer version.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string    $page   name of the page to index
     * @param boolean   $verbose    print status messages
     * @param boolean   $force  force reindexing even when the index is up to date
     * @return string|boolean  the function completed successfully;
     *                         note that verbose mode returns true once
     *                         indexing was attempted, even if it failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function addPage($page, $verbose=false, $force=false)
    {
        $idxtag = metaFN($page,'.indexed');
        // check if page was deleted but is still in the index
        if (!page_exists($page)) {
            if (!file_exists($idxtag)) {
                if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
                return false;
            }
            $result = $this->deletePage($page);
            if ($result === 'locked') {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
            return $result;
        }

        // check if indexing needed
        if (!$force && file_exists($idxtag)) {
            if (trim(io_readFile($idxtag)) == $this->getVersion()) {
                $last = @filemtime($idxtag);
                if ($last > @filemtime(wikiFN($page))) {
                    if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                    return false;
                }
            }
        }

        $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
        if ($indexenabled === false) {
            $result = false;
            if (file_exists($idxtag)) {
                $result = $this->deletePage($page);
                if ($result === 'locked') {
                    if ($verbose) print("Indexer: locked".DOKU_LF);
                    return false;
                }
                @unlink($idxtag);
            }
            if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
            return $result;
        }

        $pid = $this->getPID($page);
        if ($pid === false) {
            if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
            return false;
        }
        $body = '';
        $metadata = array();
        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

        $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
        $metadata['relation_references'] = ($references !== null) ?
            array_keys($references) : array();

        $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
        $metadata['relation_media'] = ($media !== null) ?
            array_keys($media) : array();

        $data = compact('page', 'body', 'metadata', 'pid');
        $evt = new Event('INDEXER_PAGE_ADD', $data);
        if ($evt->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page);
        $evt->advise_after();
        unset($evt);

        // Take over only the fields used below. A blanket extract() would let
        // any extra key in the event data overwrite unrelated local variables
        // such as $verbose or $idxtag.
        if (array_key_exists('page', $data)) $page = $data['page'];
        if (array_key_exists('body', $data)) $body = $data['body'];
        if (array_key_exists('metadata', $data)) $metadata = $data['metadata'];

        $result = $this->addPageWords($page, $body);
        if ($result === 'locked') {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }

        if ($result) {
            $result = $this->addMetaKeys($page, $metadata);
            if ($result === 'locked') {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
        }

        if ($result) {
            io_saveFile(metaFN($page,'.indexed'), $this->getVersion());
        }
        if ($verbose) {
            print("Indexer: finished".DOKU_LF);
            // existing contract: verbose mode reports true regardless of $result
            return true;
        }
        return $result;
    }

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
*
     * The index is locked for the duration of the call.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean  true on success, false on failure,
     *                         'locked' when the lock could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text)
    {
        if (!$this->lock()) {
            return 'locked';
        }

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // get word usage in page: array(wordlen => array(wordid => frequency))
        $usage = $this->getPageWords($text);
        if ($usage === false) {
            $this->unlock();
            return false;
        }

        // "wordlen*wordid" entries for every word the page uses now
        $current = array();
        foreach ($usage as $len => $frequencies) {
            $index = $this->getIndex('i', $len);
            foreach ($frequencies as $wid => $freq) {
                $tuples = ($wid < count($index)) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($tuples, $pid, $freq);
                $current[] = "$len*$wid";
            }
            if (!$this->saveIndex('i', $len, $index)) {
                $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries
        $stored = $this->getIndexKey('pageword', '', $pid);
        if ($stored !== '') {
            // words listed in the old reverse index but absent from the new text
            $stale = array();
            foreach (array_diff(explode(':', $stored), $current) as $entry) {
                if ($entry == '') {
                    continue;
                }
                list($len, $wid) = explode('*', $entry);
                $stale[$len][] = (int)$wid;
            }
            foreach ($stale as $len => $wids) {
                $index = $this->getIndex('i', $len);
                foreach ($wids as $wid) {
                    // a frequency of 0 is passed for words the page no longer uses
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $len, $index);
            }
        }

        // Save the reverse index
        $saved = $this->saveIndexKey('pageword', '', $pid, implode(':', $current));
        $this->unlock();
        return $saved ? true : false;
    }

/**
     * Split the words in a page and add them to the index.
     *
     * Words that are not yet in the word index of their length are appended
     * to it; a word index is written back only when it actually gained words.
     *
     * @param string    $text   content of the page
     * @return array|false      list of word IDs and number of times used,
     *                          as array(wordlen => array(wordid => frequency));
     *                          false when a word index could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text)
    {
        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group the tokens by their length; token keys are unique here,
        // so no frequencies need to be merged
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[static::wordlen($w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();  //resulting index
        foreach ($words as $wlen => $frequencies) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length, so an untouched index is not rewritten
            // just because an earlier length gained a word
            $word_idx_modified = false;
            foreach ($frequencies as $word => $freq) {
                // numeric tokens come back from array_count_values() as int keys
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
*
     * The index is locked for the duration of the call.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string   true on success, false when the page id could
     *                          not be determined, 'locked' when the lock
     *                          could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null)
    {
        if (is_array($key)) {
            if (!is_null($value)) {
                // $key is array, but $value is not null
                trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
            }
        } else {
            $key = array($key => $value);
        }

        if (!$this->lock()) {
            return 'locked';
        }

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (array_key_exists('title', $key)) {
            $title = $key['title'];
            if (is_array($title)) {
                $title = $title[0];
            }
            $this->saveIndexKey('title', '', $pid, $title);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = $this->cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $words_added = false;

            if (!is_array($values)) {
                $values = array($values);
            }

            // value ids currently stored for the page, all marked for removal first
            // -1 means remove, 0 keep, 1 add
            $actions = array();
            $stored = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($stored !== '') {
                $actions = array_fill_keys(explode(':', $stored), -1);
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val === '') {
                    continue;
                }
                $id = array_search($val, $metawords, true);
                if ($id === false) {
                    // didn't find $val, so we'll add it to the end of metawords
                    // and create a placeholder in metaidx
                    $id = count($metawords);
                    $metawords[$id] = $val;
                    $metaidx[$id] = '';
                    $words_added = true;
                }
                // a value that is already in the index is kept, anything else is added
                $known = isset($actions[$id]) && $actions[$id] <= 0;
                $actions[$id] = $known ? 0 : 1;
            }

            if ($words_added) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }

            // apply removals/additions and collect the ids that stay with the page
            $remaining = array();
            $changed = false;
            foreach ($actions as $id => $action) {
                if ($action == -1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $changed = true;
                    continue;
                }
                if ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $changed = true;
                }
                $remaining[] = $id;
            }

            if ($changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                $this->saveIndexKey($metaname.'_p', '', $pid, implode(':', $remaining));
            }

            // free the index arrays before the next key
            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content.
     * This function doesn't check if the old or new name exists in the filesystem.
     * It returns an error if the old page isn't in the page list of the indexer
     * and it deletes all previously indexed content of the new page.
*
     * The index is locked for the duration of the call and released on
     * every exit path.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool If the page was successfully renamed,
     *                     can be a message in the case of an error
     */
    public function renamePage($oldpage, $newpage)
    {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock, otherwise the index stays locked
                $this->unlock();
                return false;
            }

            // replace the old entry of the target name with a placeholder
            // so its line in the page index stays occupied
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Renames a meta value in the index.
     * This doesn't change the meta value in the pages, it assumes that
     * all pages will be updated.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed,
     *                          if exists values will be merged
     * @return bool|string      If renaming the value has been successful, false
     *                          or error message on error.
*/
    public function renameMetaValue($key, $oldvalue, $newvalue)
    {
        if (!$this->lock()) {
            return 'locked';
        }

        // change the relation references index
        $metavalues = $this->getIndex($key, '_w');
        $oldid = array_search($oldvalue, $metavalues, true);
        if ($oldid === false) {
            // the old value is not indexed, nothing to rename
            $this->unlock();
            return true;
        }

        $newid = array_search($newvalue, $metavalues, true);
        if ($newid === false) {
            // the new value is unknown: simply relabel the existing entry
            $metavalues[$oldid] = $newvalue;
            $saved = $this->saveIndex($key.'_w', '', $metavalues);
            $this->unlock();
            return $saved ? true : false;
        }

        // free memory
        unset($metavalues);

        // okay, now we have two entries for the same value. we need to merge them.
        $oldline = $this->getIndexKey($key.'_i', '', $oldid);
        if ($oldline != '') {
            $mergedline = $this->getIndexKey($key.'_i', '', $newid);
            $pagekeys = $this->getIndex($key.'_p', '');
            foreach (explode(':', $oldline) as $tuple) {
                list($id, $count) = explode('*', $tuple);
                $mergedline = $this->updateTuple($mergedline, $id, $count);

                // remove old meta value
                $valueids = array_diff(explode(':', $pagekeys[$id]), array($oldid));
                // add new meta value when not already present
                if (!in_array($newid, $valueids)) {
                    $valueids[] = $newid;
                }
                $pagekeys[$id] = implode(':', $valueids);
            }
            $this->saveIndex($key.'_p', '', $pagekeys);
            unset($pagekeys);
            $this->saveIndexKey($key.'_i', '', $oldid, '');
            $this->saveIndexKey($key.'_i', '', $newid, $mergedline);
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
*
     * @param string    $page   a page name
     * @return string|boolean  the function completed successfully,
     *                         'locked' when the lock could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page)
    {
        if (!$this->lock()) return 'locked';

        $result = $this->deletePageNoLock($page);
        $this->unlock();
        return $result;
    }

    /**
     * Remove a page from the index without locking the index,
     * only use this function if the index is already locked
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function deletePageNoLock($page)
    {
        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            return false;
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            // group the "wordlen*wordid" entries of the page by word length
            $delwords = explode(':', $pageword_idx);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, '')) {
            return false;
        }

        // erase the title and every metadata value stored for the page
        $this->saveIndexKey('title', '', $pid, '');
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                if ($id === '') continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        return true;
    }

    /**
     * Clear the whole index
     *
     * Deletes the core index files as well as all fulltext and metadata
     * index files found in the index directory and empties the PID cache.
     *
     * @return bool If the index has been cleared successfully
     */
    public function clear()
    {
        global $conf;

        if (!$this->lock()) return false;

        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if ($dir !== false) {
            while (($f = readdir($dir)) !== false) {
                if (in_array($f[0], ['i', 'w']) && substr($f, -4) == '.idx') {
                    // fulltext index
                    @unlink($conf['indexdir']."/$f");
                } elseif (in_array(substr($f, -6), ['_w.idx','_i.idx','_p.idx'])) {
                    // metadata index
                    @unlink($conf['indexdir']."/$f");
                }
            }
            // release the directory handle again
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Split the text into words for fulltext search
     *
     * Words are lowercased; stop words and non-numeric words shorter than
     * the minimum word length are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false)
    {
        // the asterisk is only stripped as a special char when wildcards are not allowed
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $evt = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
            array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
            )
        );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $i => $word) {
            // plain ASCII words take the cheaper byte-wise strtolower()
            $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                Utf8\PhpString::strtolower($word) : strtolower($word);
        }

        // fetch the stop word list once instead of once per word
        $stopwords = $this->getStopwords();
        foreach ($wordlist as $i => $word) {
            if ((!is_numeric($word) && strlen($word) < $this->MinWordLength)
                || in_array($word, $stopwords, true)) {
                unset($wordlist[$i]);
            }
        }
        return array_values($wordlist);
    }

    /**
     * Get the numeric PID of a page
     *
     * A cached PID is returned without locking; otherwise the index is
     * locked while the PID is looked up.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    public function getPID($page)
    {
        // return PID without locking when it is in the cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        if (!$this->lock()) return false;

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return $pid;
    }

    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
*
     * Successful lookups are kept in a small cache keyed by page name.
     *
     * @param string $page The page to get the PID for
     * @return int|bool The page id on success, false on error
     */
    protected function getPIDNoLock($page)
    {
        // avoid expensive addIndexKey operation for the most recently
        // requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
        $pid = $this->addIndexKey('page', '', $page);
        // a failed lookup is not cached, so the next request tries again
        // instead of being served the stale failure
        if ($pid === false) return false;
        // keep the cache small by discarding the oldest element once it
        // holds more than 10 entries, as in DokuWiki usually only the most
        // recently added item will be requested again
        if (count($this->pidCache) > 10) {
            // array_shift() must not be used here: it renumbers integer-like
            // keys, which would attach the PIDs of numeric page names (e.g.
            // "2020") to the keys 0, 1, ... and return wrong PIDs later
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }

    /**
     * Get the page id of a numeric PID
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid)
    {
        return $this->getIndexKey('page', '', $pid);
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
*
     * Pages that no longer exist are left out of the result.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens)
    {
        // $matches is filled with token => array("wordlen*wordid" ...)
        $matches = array();
        $wids = $this->getIndexWords($tokens, $matches);
        if (empty($wids)) {
            return array();
        }

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $lines = count($index);
            foreach (array_unique($ids) as $ixid) {
                // ids beyond the end of the index are skipped
                if ($ixid >= $lines) {
                    continue;
                }
                $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($matches as $word => $locations) {
            $final[$word] = array();
            foreach ($locations as $location) {
                // no entry was created for ids that were skipped above
                if (!isset($docs[$location])) {
                    continue;
                }
                foreach ($docs[$location] as $pagename => $count) {
                    // make sure the document still exists
                    if (!page_exists($pagename, '', false)) {
                        continue;
                    }
                    if (isset($final[$word][$pagename])) {
                        $final[$word][$pagename] += $count;
                    } else {
                        $final[$word][$pagename] = $count;
                    }
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
*
     * Without a callback, a leading and/or trailing '*' in a value acts as
     * a wildcard.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for, must be a string or array of strings
     * @param callback  $func   comparison function
     * @return array            lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func=null)
    {
        $single = !is_array($value);
        $queries = $single ? array($value) : $value;

        $metaname = $this->cleanName($key);
        $is_title = ($key == 'title');

        // get all words in order to search the matching ids
        $words = $is_title
            ? $this->getIndex('title', '')
            : $this->getIndex($metaname.'_w', '');

        // the matching ids for the provided value(s): id => array(query value ...)
        $value_ids = array();
        foreach ($queries as $val) {
            if (!is_null($func)) {
                // custom comparison: test the query against every indexed word
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word))) {
                        $value_ids[$i][] = $val;
                    }
                }
                continue;
            }

            // check for wildcards
            $xval = $val;
            $open_start = (substr($xval, 0, 1) == '*');
            if ($open_start) {
                $xval = substr($xval, 1);
            }
            $open_end = (substr($xval, -1, 1) == '*');
            if ($open_end) {
                $xval = substr($xval, 0, -1);
            }

            if ($open_start || $open_end) {
                // anchor only the side that carries no wildcard
                $re = ($open_start ? '' : '^').preg_quote($xval, '/').($open_end ? '' : '$');
                foreach (array_keys(preg_grep('/'.$re.'/', $words)) as $i) {
                    $value_ids[$i][] = $val;
                }
            } else {
                $i = array_search($val, $words, true);
                if ($i !== false) {
                    $value_ids[$i][] = $val;
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = array();
        foreach ($queries as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        if ($is_title) {
            // Special handling for titles: the matched id is used directly
            // as the position in the page index
            foreach ($value_ids as $pid => $val_list) {
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken
            // and matched with the pages
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                // parse the tuples of the form page_id*1:page2_id*1 and so on,
                // return value is an array with page_id => 1, page2_id => 1 etc.
                // so take the keys only
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($val_list as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        // a single query value yields a flat list of pages
        if ($single) {
            $result = $result[$value];
        }
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result)
    {
        // base word => list of array(query term, regexp or null for exact terms)
        $queries = array();
        // base word length => list of base words
        $byLength = array();
        // base word => base word length, for wildcard terms only
        $wildcards = array();

        foreach ($words as $term) {
            $result[$term] = array();
            $base = $term;
            $length = static::wordlen($term);

            // strip the wildcards and take them out of the measured length
            $openStart = (substr($base, 0, 1) == '*');
            if ($openStart) {
                $base = substr($base, 1);
                $length -= 1;
            }
            $openEnd = (substr($base, -1, 1) == '*');
            if ($openEnd) {
                $base = substr($base, 0, -1);
                $length -= 1;
            }
            $isWild = $openStart || $openEnd;

            // exact terms below the minimum length are ignored unless numeric
            if ($length < $this->MinWordLength && !$isWild && !is_numeric($base)) {
                continue;
            }

            if (!isset($queries[$base])) {
                $byLength[$length][] = $base;
            }
            if ($isWild) {
                $pattern = ($openStart ? '' : '^').preg_quote($base, '/').($openEnd ? '' : '$');
                $queries[$base][] = array($term, '/'.$pattern.'/');
                if (!isset($wildcards[$base])) {
                    $wildcards[$base] = $length;
                }
            } else {
                $queries[$base][] = array($term, null);
            }
        }
        asort($wildcards);

        // without wildcards only the lengths of the terms themselves are needed,
        // otherwise every index from the shortest base word length upwards
        $filter = empty($wildcards) ? $byLength : min(array_keys($byLength));
        $knownLengths = $this->indexLengths($filter);
        if (!empty($wildcards)) {
            sort($knownLengths);
        }

        // get word IDs
        $wids = array();
        foreach ($knownLengths as $ixlen) {
            $indexWords = $this->getIndex('w', $ixlen);

            // handle exact search
            if (isset($byLength[$ixlen])) {
                foreach ($byLength[$ixlen] as $base) {
                    $wid = array_search($base, $indexWords, true);
                    if ($wid === false) {
                        continue;
                    }
                    $wids[$ixlen][] = $wid;
                    foreach ($queries[$base] as $query) {
                        $result[$query[0]][] = "$ixlen*$wid";
                    }
                }
            }

            // handle wildcard search; $wildcards is sorted by length, so the
            // loop ends at the first base word that is not shorter than $ixlen
            foreach ($wildcards as $base => $length) {
                if ($length >= $ixlen) {
                    break;
                }
                foreach ($queries[$base] as $query) {
                    if (is_null($query[1])) {
                        continue;
                    }
                    foreach (array_keys(preg_grep($query[1], $indexWords)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$query[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
*
     * @param string $key list only pages containing the metadata key (optional)
     * @return array list of page names
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null)
    {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title line
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title) {
                if ($title === '') unset($page_idx[$i]);
            }
            return array_values($page_idx);
        }

        $metaname = $this->cleanName($key);

        // Collect the page names as array keys so each page is listed once.
        // array_merge() must not be used for this: it renumbers integer keys,
        // and PHP turns purely numeric page names into integer keys.
        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            foreach ($this->parseTuples($page_idx, $line) as $page => $count) {
                $pages[$page] = true;
            }
        }
        // numeric names come back from the keys as integers, return strings
        return array_map('strval', array_keys($pages));
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int $min bottom frequency threshold
     * @param int $max upper frequency limit. No limit if $max<$min
     * @param int $minlen minimum length of words to count
     * @param string $key metadata key to list.
* Uses the fulltext index if not given
     * @return array list of words as the keys and frequency as values
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null)
    {
        if ($min < 1) $min = 1;
        // a maximum below the minimum means "no upper limit"
        if ($max < $min) $max = 0;

        $result = array();

        if ($key == 'title') {
            // the frequency of a title is the number of lines carrying it
            $index = array_count_values($this->getIndex('title', ''));
            foreach ($index as $val => $cnt) {
                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) {
                    $result[$val] = $cnt;
                }
            }
        } elseif (!is_null($key)) {
            $metaname = $this->cleanName($key);
            $index = $this->getIndex($metaname.'_i', '');
            $val_idx = array();
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max)) {
                    $val_idx[$wid] = $freq;
                }
            }
            // the word list is only loaded when something passed the filter
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($val_idx as $wid => $freq) {
                    // skip tuple lines that have no word in the word list
                    if (!isset($words[$wid])) continue;
                    if (strlen($words[$wid]) >= $minlen) {
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        } else {
            $lengths = $this->listIndexLengths();
            foreach ($lengths as $length) {
                if ($length < $minlen) continue;
                $index = $this->getIndex('i', $length);
                $words = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq >= $min && (!$max || $freq <= $max)) {
                        // load the word list for this length on first use
                        if ($words === null) {
                            $words = $this->getIndex('w', $length);
                        }
                        // skip tuple lines that have no word in the word list
                        if (!isset($words[$wid])) continue;
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Clean a name of a key for use as a file name.
     *
     * Romanizes non-latin characters, then strips away anything that's
     * not a letter, number, or underscore.
*
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $name
     * @return string
     */
    protected function cleanName($name)
    {
        $romanized = Utf8\Clean::romanize(trim((string)$name));
        // runs of separator characters collapse into a single underscore
        $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);
        // everything else that is not a letter, digit or underscore is dropped
        $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $underscored);
        return strtolower($stripped);
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory in the lock dir. A lock directory older than
     * five minutes is considered stale and removed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool|string true when the lock was acquired, "stale lock removed"
     *                     when it was acquired after removing a stale lock,
     *                     false when the lock could not be acquired
     */
    protected function lock()
    {
        global $conf;
        $lockDir = $conf['lockdir'].'/_indexer.lock';
        $outcome = true;
        $attempts = 0;

        while (!@mkdir($lockDir, $conf['dmode'])) {
            // NOTE(review): usleep() takes microseconds, so the 1000 polls
            // below sleep for about 50ms in total, not for seconds
            usleep(50);
            $stale = is_dir($lockDir) && time() - @filemtime($lockDir) > 60*5;
            if ($stale) {
                // looks like a stale lock - remove it
                if (!@rmdir($lockDir)) {
                    return false;
                }
                $outcome = "stale lock removed";
            } elseif ($attempts++ == 1000) {
                // give up waiting for the lock
                return false;
            }
        }

        if (!empty($conf['dperm'])) {
            chmod($lockDir, $conf['dperm']);
        }
        return $outcome;
    }

    /**
     * Release the indexer lock.
     *
     * Removal errors are suppressed, so this always reports success.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool
     */
    protected function unlock()
    {
        global $conf;
        $lockDir = $conf['lockdir'].'/_indexer.lock';
        @rmdir($lockDir);
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into multiple parts.
     * Different index files should use different base names.
*
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @return array list of lines without CR or LF, empty if the index
     *               does not exist or cannot be read
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getIndex($idx, $suffix)
    {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return array();
        $lines = file($fn, FILE_IGNORE_NEW_LINES);
        // file() returns false when reading fails; callers expect an array
        return ($lines === false) ? array() : $lines;
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The lines are written to a temporary file which then replaces the
     * index file. If writing fails the existing index is left untouched.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param array $lines list of lines without LF
     * @return bool If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines)
    {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;

        // every line is terminated by a LF, an empty index is an empty file
        $data = join("\n", $lines);
        if (!empty($lines)) {
            $data .= "\n";
        }
        $written = fwrite($fh, $data);
        fclose($fh);

        // never replace the index with an incompletely written file
        if ($written !== strlen($data)) {
            @unlink($fn.'.tmp');
            return false;
        }

        if (isset($conf['fperm'])) {
            chmod($fn.'.tmp', $conf['fperm']);
        }
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * Returns an empty string if the index cannot be opened or has no
     * line with the given number.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param int $id the line number
     * @return string a line with trailing whitespace removed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id)
    {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        // read up to the requested line; $line is false if the file ends first
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
*
     * The index is copied into a temporary file with line $id replaced, and
     * the temporary file then replaces the index. If the index has fewer
     * lines than $id, empty lines are added in between.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param int $id the line number
     * @param string $line line to write
     * @return bool If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line)
    {
        global $conf;
        // the entry is always written with a terminating LF
        $entry = (substr($line, -1) !== "\n") ? $line."\n" : $line;
        $base = $conf['indexdir'].'/'.$idx.$suffix;

        $out = @fopen($base.'.tmp', 'w');
        if (!$out) return false;

        // copy the existing index, swapping in the new entry at line $id
        $last = -1;
        $in = @fopen($base.'.idx', 'r');
        $hadIndex = (bool)$in;
        if ($hadIndex) {
            while (($current = fgets($in)) !== false) {
                fwrite($out, (++$last == $id) ? $entry : $current);
            }
            fclose($in);
        }

        // no index yet, or it ended before line $id: pad with empty lines
        if (!$hadIndex || $id > $last) {
            while ($id > ++$last) {
                fwrite($out, "\n");
            }
            fwrite($out, $entry);
        }
        fclose($out);

        if (isset($conf['fperm'])) {
            chmod($base.'.tmp', $conf['fperm']);
        }
        io_rename($base.'.tmp', $base.'.idx');
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
*
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param string $value line to find in the index
     * @return int|bool line number of the value in the index
     *                  or false if writing the index failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value)
    {
        $index = $this->getIndex($idx, $suffix);
        $id = array_search($value, $index, true);
        if ($id === false) {
            // unknown value: append it and write the whole index back
            $id = count($index);
            $index[$id] = $value;
            if (!$this->saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * The cache file is only used when $conf['readdircache'] is not 0; that
     * value is also the number of seconds the cache file stays valid.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @return array
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // testing what we have to do, create a cache file or not.
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', $lengths);
                }
            }
        }

        // no usable cache: collect the lengths from the i<length>.idx file names
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file; caching is best effort, but the handle has to
        // be checked because writing to a failed fopen() is a TypeError in PHP 8
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }
        return $idx;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
*
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @param array|int $filter array whose keys are the lengths to check,
     *                          or the minimum length to return
     * @return array
     */
    protected function indexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // keep all the indexed lengths equal or superior to the filter
            $minimum = (int)$filter;
            $kept = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length >= $minimum) {
                    $kept[] = $length;
                }
            }
            return $kept;
        }

        // testing if index files exist only
        $prefix = $conf['indexdir']."/i";
        $kept = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $kept[] = $length;
            }
        }
        return $kept;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * An existing tuple for $id is removed from the line. Unless $count is
     * empty, a new "$id*$count" tuple is put at the start of the line.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @param string|int $id
     * @param int    $count
     * @return string
     */
    protected function updateTuple($line, $id, $count)
    {
        // take out the tuple currently stored for $id
        if ($line != '') {
            $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
        }
        $remaining = trim($line, ':');

        if (!$count) {
            return $remaining;
        }
        $tuple = "$id*$count";
        return $remaining ? $tuple.':'.$remaining : $tuple;
    }

    /**
     * Split a line into an array of tuples.
*
     * The line consists of "id*count" tuples separated by ':'. Every id is
     * translated through $keys; tuples with an empty count, a missing '*'
     * or an id that is not in $keys are skipped.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param array $keys list used to translate the ids, indexed by id
     * @param string $line
     * @return array translated key => count
     */
    protected function parseTuples(&$keys, $line)
    {
        $result = array();
        if ($line == '') return $result;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;
            // pad so that a tuple without '*' has an empty count
            list($key, $cnt) = array_pad(explode('*', $tuple), 2, '');
            if (!$cnt) continue;
            // ids without an entry in $keys cannot be translated
            if (!isset($keys[$key])) continue;
            $key = $keys[$key];
            if ($key === false) continue;
            $result[$key] = $cnt;
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * Tuples without a count do not add to the sum.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line)
    {
        $freq = 0;
        foreach (explode(':', (string)$line) as $tuple) {
            if ($tuple === '') continue;
            $parts = explode('*', $tuple);
            if (isset($parts[1])) {
                $freq += (int)$parts[1];
            }
        }
        return $freq;
    }
}