xref: /dokuwiki/inc/Search/Indexer.php (revision 115f49194ce52c68de5b52a13f9f6e017b138e12)
1<?php
2namespace dokuwiki\Search;
3
4use dokuwiki\Extension\Event;
5use dokuwiki\Utf8;
6
7// Version tag used to force rebuild on upgrade
8define('INDEXER_VERSION', 8);
9
10// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
11if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
12
13
14/**
 * Class DokuWiki Indexer for Fulltext Search
16 *
17 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
18 * @author     Andreas Gohr <andi@splitbrain.org>
19 * @author Tom N Harris <tnharris@whoopdedo.org>
20 */
21class Indexer {
22
23    /** @var Indexer */
24    protected static $instance = null;
25
26    /** @var array $pidCache Cache for getPID() */
27    protected $pidCache = array();
28
29    /** @var array $Stopwords Words that indexer ignores */
30    protected $Stopwords;
31
32    /**
33     * Indexer constructor. Singleton, thus protected!
34     */
35    protected function __construct() {}
36
37    /**
38     * Get new or existing singleton instance of the Indexer
39     *
40     * @return Indexer
41     */
42    public static function getInstance()
43    {
44        if (is_null(static::$instance)) {
45            static::$instance = new static();
46        }
47        return static::$instance;
48    }
49
50    /**
51     * Returns words that will be ignored
52     *
53     * @return array  list of stop words
54     *
55     * @author Tom N Harris <tnharris@whoopdedo.org>
56     */
57    public function getStopwords()
58    {
59        if (!isset($this->Stopwords)) {
60            global $conf;
61            $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
62            if (file_exists($swfile)) {
63                $this->Stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
64            } else {
65                $this->Stopwords = array();
66           }
67        }
68        return $this->Stopwords;
69    }
70
71    /**
72     * Measure the length of a string.
73     * Differs from strlen in handling of asian characters.
74     *
75     * @author Tom N Harris <tnharris@whoopdedo.org>
76     *
77     * @param string $w
78     * @return int
79     */
80    public static function wordlen($w)
81    {
82        $l = strlen($w);
83        // If left alone, all chinese "words" will get put into w3.idx
84        // So the "length" of a "word" is faked
85        if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
86            foreach ($leadbytes[0] as $b) {
87                $l += ord($b) - 0xE1;
88            }
89        }
90        return $l;
91    }
92
93    /**
94     * Version of the indexer taking into consideration the external tokenizer.
95     * The indexer is only compatible with data written by the same version.
96     *
97     * @triggers INDEXER_VERSION_GET
98     * Plugins that modify what gets indexed should hook this event and
99     * add their version info to the event data like so:
100     *     $data[$plugin_name] = $plugin_version;
101     *
102     * @author Tom N Harris <tnharris@whoopdedo.org>
103     * @author Michael Hamann <michael@content-space.de>
104     *
105     * @return int|string
106     */
107    public function getVersion()
108    {
109        static $indexer_version = null;
110        if ($indexer_version == null) {
111            $version = INDEXER_VERSION;
112
113            // DokuWiki version is included for the convenience of plugins
114            $data = array('dokuwiki'=>$version);
115            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
116            unset($data['dokuwiki']); // this needs to be first
117            ksort($data);
118            foreach ($data as $plugin => $vers) {
119                $version .= '+'.$plugin.'='.$vers;
120            }
121            $indexer_version = $version;
122        }
123        return $indexer_version;
124    }
125
126    /**
127     * Adds/updates the search index for the given page
128     *
129     * Locking is handled internally.
130     *
131     * @param string $page   name of the page to index
132     * @param bool $verbose  print status messages
133     * @param bool $force    force reindexing even when the index is up to date
134     * @return string|bool   the function completed successfully
135     *
136     * @author Tom N Harris <tnharris@whoopdedo.org>
137     */
138    public function addPage($page, $verbose=false, $force=false)
139    {
140        $idxtag = metaFN($page,'.indexed');
141        // check if page was deleted but is still in the index
142        if (!page_exists($page)) {
143            if (!file_exists($idxtag)) {
144                if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
145                return false;
146            }
147            $result = $this->deletePage($page);
148            if ($result === 'locked') {
149                if ($verbose) print("Indexer: locked".DOKU_LF);
150                return false;
151            }
152            @unlink($idxtag);
153            return $result;
154        }
155
156        // check if indexing needed
157        if (!$force && file_exists($idxtag)) {
158            if (trim(io_readFile($idxtag)) == $this->getVersion()) {
159                $last = @filemtime($idxtag);
160                if ($last > @filemtime(wikiFN($page))) {
161                    if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
162                    return false;
163                }
164            }
165        }
166
167        $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
168        if ($indexenabled === false) {
169            $result = false;
170            if (file_exists($idxtag)) {
171                $result = $this->deletePage($page);
172                if ($result === 'locked') {
173                    if ($verbose) print("Indexer: locked".DOKU_LF);
174                    return false;
175                }
176                @unlink($idxtag);
177            }
178            if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
179            return $result;
180        }
181
182        $pid = $this->getPID($page);
183        if ($pid === false) {
184            if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
185            return false;
186        }
187        $body = '';
188        $metadata = array();
189        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
190
191        $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
192        $metadata['relation_references'] = ($references !== null) ?
193                array_keys($references) : array();
194
195        $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
196        $metadata['relation_media'] = ($media !== null) ?
197                array_keys($media) : array();
198
199        $data = compact('page', 'body', 'metadata', 'pid');
200        $evt = new Event('INDEXER_PAGE_ADD', $data);
201        if ($evt->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page);
202        $evt->advise_after();
203        unset($evt);
204        extract($data);
205
206        $result = $this->addPageWords($page, $body);
207        if ($result === 'locked') {
208            if ($verbose) print("Indexer: locked".DOKU_LF);
209            return false;
210        }
211
212        if ($result) {
213            $result = $this->addMetaKeys($page, $metadata);
214            if ($result === 'locked') {
215                if ($verbose) print("Indexer: locked".DOKU_LF);
216                return false;
217            }
218        }
219
220        if ($result) {
221            io_saveFile(metaFN($page,'.indexed'), $this->getVersion());
222        }
223        if ($verbose) {
224            print("Indexer: finished".DOKU_LF);
225            return true;
226        }
227        return $result;
228    }
229
230    /**
231     * Adds the contents of a page to the fulltext index
232     *
233     * The added text replaces previous words for the same page.
234     * An empty value erases the page.
235     *
236     * @param string $page  a page name
237     * @param string $text  the body of the page
238     * @return string|bool  the function completed successfully
239     *
240     * @author Tom N Harris <tnharris@whoopdedo.org>
241     * @author Andreas Gohr <andi@splitbrain.org>
242     */
243    public function addPageWords($page, $text)
244    {
245        if (!$this->lock()) return 'locked';
246
247        // load known documents
248        $pid = $this->getPIDNoLock($page);
249        if ($pid === false) {
250            $this->unlock();
251            return false;
252        }
253
254        $pagewords = array();
255        // get word usage in page
256        $words = $this->getPageWords($text);
257        if ($words === false) {
258            $this->unlock();
259            return false;
260        }
261
262        if (!empty($words)) {
263            foreach (array_keys($words) as $wlen) {
264                $index = $this->getIndex('i', $wlen);
265                foreach ($words[$wlen] as $wid => $freq) {
266                    $idx = ($wid < count($index)) ? $index[$wid] : '';
267                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
268                    $pagewords[] = "$wlen*$wid";
269                }
270                if (!$this->saveIndex('i', $wlen, $index)) {
271                    $this->unlock();
272                    return false;
273                }
274            }
275        }
276
277        // Remove obsolete index entries
278        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
279        if ($pageword_idx !== '') {
280            $oldwords = explode(':',$pageword_idx);
281            $delwords = array_diff($oldwords, $pagewords);
282            $upwords = array();
283            foreach ($delwords as $word) {
284                if ($word != '') {
285                    list($wlen,$wid) = explode('*', $word);
286                    $wid = (int)$wid;
287                    $upwords[$wlen][] = $wid;
288                }
289            }
290            foreach ($upwords as $wlen => $widx) {
291                $index = $this->getIndex('i', $wlen);
292                foreach ($widx as $wid) {
293                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
294                }
295                $this->saveIndex('i', $wlen, $index);
296            }
297        }
298        // Save the reverse index
299        $pageword_idx = join(':', $pagewords);
300        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
301            $this->unlock();
302            return false;
303        }
304
305        $this->unlock();
306        return true;
307    }
308
309    /**
310     * Split the words in a page and add them to the index.
311     *
312     * @param string $text  content of the page
313     * @return array        list of word IDs and number of times used
314     *
315     * @author Andreas Gohr <andi@splitbrain.org>
316     * @author Christopher Smith <chris@jalakai.co.uk>
317     * @author Tom N Harris <tnharris@whoopdedo.org>
318     */
319    protected function getPageWords($text)
320    {
321        $tokens = $this->tokenizer($text);
322        $tokens = array_count_values($tokens);  // count the frequency of each token
323
324        $words = array();
325        foreach ($tokens as $w => $c) {
326            $l = static::wordlen($w);
327            if (isset($words[$l])) {
328                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
329            } else {
330                $words[$l] = array($w => $c);
331            }
332        }
333
334        // arrive here with $words = array(wordlen => array(word => frequency))
335        $word_idx_modified = false;
336        $index = array();   //resulting index
337        foreach (array_keys($words) as $wlen) {
338            $word_idx = $this->getIndex('w', $wlen);
339            foreach ($words[$wlen] as $word => $freq) {
340                $word = (string)$word;
341                $wid = array_search($word, $word_idx, true);
342                if ($wid === false) {
343                    $wid = count($word_idx);
344                    $word_idx[] = $word;
345                    $word_idx_modified = true;
346                }
347                if (!isset($index[$wlen])) {
348                    $index[$wlen] = array();
349                }
350                $index[$wlen][$wid] = $freq;
351            }
352            // save back the word index
353            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
354                return false;
355            }
356        }
357
358        return $index;
359    }
360
361    /**
362     * Add/update keys to/of the metadata index.
363     *
364     * Adding new keys does not remove other keys for the page.
365     * An empty value will erase the key.
366     * The $key parameter can be an array to add multiple keys. $value will
367     * not be used if $key is an array.
368     *
369     * @param string $page  a page name
370     * @param mixed $key    a key string or array of key=>value pairs
371     * @param mixed $value  the value or list of values
372     * @return bool|string  the function completed successfully
373     *
374     * @author Tom N Harris <tnharris@whoopdedo.org>
375     * @author Michael Hamann <michael@content-space.de>
376     */
377    public function addMetaKeys($page, $key, $value=null)
378    {
379        if (!is_array($key)) {
380            $key = array($key => $value);
381        } elseif (!is_null($value)) {
382            // $key is array, but $value is not null
383            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
384        }
385
386        if (!$this->lock()) return 'locked';
387
388        // load known documents
389        $pid = $this->getPIDNoLock($page);
390        if ($pid === false) {
391            $this->unlock();
392            return false;
393        }
394
395        // Special handling for titles so the index file is simpler
396        if (array_key_exists('title', $key)) {
397            $value = $key['title'];
398            if (is_array($value)) {
399                $value = $value[0];
400            }
401            $this->saveIndexKey('title', '', $pid, $value);
402            unset($key['title']);
403        }
404
405        foreach ($key as $name => $values) {
406            $metaname = $this->cleanName($name);
407            $this->addIndexKey('metadata', '', $metaname);
408            $metaidx = $this->getIndex($metaname.'_i', '');
409            $metawords = $this->getIndex($metaname.'_w', '');
410            $addwords = false;
411
412            if (!is_array($values)) $values = array($values);
413
414            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
415            if ($val_idx !== '') {
416                $val_idx = explode(':', $val_idx);
417                // -1 means remove, 0 keep, 1 add
418                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
419            } else {
420                $val_idx = array();
421            }
422
423            foreach ($values as $val) {
424                $val = (string)$val;
425                if ($val !== '') {
426                    $id = array_search($val, $metawords, true);
427                    if ($id === false) {
428                        // didn't find $val, so we'll add it to the end of metawords
429                        // and create a placeholder in metaidx
430                        $id = count($metawords);
431                        $metawords[$id] = $val;
432                        $metaidx[$id] = '';
433                        $addwords = true;
434                    }
435                    // test if value is already in the index
436                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0) {
437                        $val_idx[$id] = 0;
438                    } else { // else add it
439                        $val_idx[$id] = 1;
440                    }
441                }
442            }
443
444            if ($addwords) {
445                $this->saveIndex($metaname.'_w', '', $metawords);
446            }
447            $vals_changed = false;
448            foreach ($val_idx as $id => $action) {
449                if ($action == -1) {
450                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
451                    $vals_changed = true;
452                    unset($val_idx[$id]);
453                } elseif ($action == 1) {
454                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
455                    $vals_changed = true;
456                }
457            }
458
459            if ($vals_changed) {
460                $this->saveIndex($metaname.'_i', '', $metaidx);
461                $val_idx = implode(':', array_keys($val_idx));
462                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
463            }
464
465            unset($metaidx);
466            unset($metawords);
467        }
468
469        $this->unlock();
470        return true;
471    }
472
473    /**
474     * Rename a page in the search index without changing the indexed content.
475     * This function doesn't check if the old or new name exists in the filesystem.
476     * It returns an error if the old page isn't in the page list of the indexer
477     * and it deletes all previously indexed content of the new page.
478     *
479     * @param string $oldpage  The old page name
480     * @param string $newpage  The new page name
481     * @return bool|string  If the page was successfully renamed,
482     *                      can be a message in the case of an error
483     */
484    public function renamePage($oldpage, $newpage)
485    {
486        if (!$this->lock()) return 'locked';
487
488        $pages = $this->getPages();
489
490        $id = array_search($oldpage, $pages, true);
491        if ($id === false) {
492            $this->unlock();
493            return 'page is not in index';
494        }
495
496        $new_id = array_search($newpage, $pages, true);
497        if ($new_id !== false) {
498            // make sure the page is not in the index anymore
499            if ($this->deletePageNoLock($newpage) !== true) {
500                return false;
501            }
502
503            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
504        }
505
506        $pages[$id] = $newpage;
507
508        // update index
509        if (!$this->saveIndex('page', '', $pages)) {
510            $this->unlock();
511            return false;
512        }
513
514        // reset the pid cache
515        $this->pidCache = array();
516
517        $this->unlock();
518        return true;
519    }
520
521    /**
522     * Renames a meta value in the index.
523     * This doesn't change the meta value in the pages, it assumes that
524     * all pages will be updated.
525     *
526     * @param string $key       The metadata key of which a value shall be changed
527     * @param string $oldvalue  The old value that shall be renamed
528     * @param string $newvalue  The new value to which the old value shall be renamed,
529     *                          if exists values will be merged
530     * @return bool|string      If renaming the value has been successful, false
531     *                          or error message on error.
532     */
533    public function renameMetaValue($key, $oldvalue, $newvalue)
534    {
535        if (!$this->lock()) return 'locked';
536
537        // change the relation references index
538        $metavalues = $this->getIndex($key, '_w');
539        $oldid = array_search($oldvalue, $metavalues, true);
540        if ($oldid !== false) {
541            $newid = array_search($newvalue, $metavalues, true);
542            if ($newid !== false) {
543                // free memory
544                unset ($metavalues);
545
546                // okay, now we have two entries for the same value. we need to merge them.
547                $indexline = $this->getIndexKey($key.'_i', '', $oldid);
548                if ($indexline != '') {
549                    $newindexline = $this->getIndexKey($key.'_i', '', $newid);
550                    $pagekeys     = $this->getIndex($key.'_p', '');
551                    $parts = explode(':', $indexline);
552                    foreach ($parts as $part) {
553                        list($id, $count) = explode('*', $part);
554                        if ($id === '') continue;
555                        $newindexline = $this->updateTuple($newindexline, $id, $count);
556
557                        $keyline = explode(':', $pagekeys[$id]);
558                        // remove old meta value
559                        $keyline = array_diff($keyline, array($oldid));
560                        // add new meta value when not already present
561                        if (!in_array($newid, $keyline)) {
562                            array_push($keyline, $newid);
563                        }
564                        $pagekeys[$id] = implode(':', $keyline);
565                    }
566                    $this->saveIndex($key.'_p', '', $pagekeys);
567                    unset($pagekeys);
568                    $this->saveIndexKey($key.'_i', '', $oldid, '');
569                    $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
570                }
571            } else {
572                $metavalues[$oldid] = $newvalue;
573                if (!$this->saveIndex($key.'_w', '', $metavalues)) {
574                    $this->unlock();
575                    return false;
576                }
577            }
578        }
579
580        $this->unlock();
581        return true;
582    }
583
584    /**
585     * Remove a page from the index
586     *
587     * Erases entries in all known indexes.
588     *
589     * @param string $page  a page name
590     * @return string|bool  the function completed successfully
591     *
592     * @author Tom N Harris <tnharris@whoopdedo.org>
593     */
594    public function deletePage($page)
595    {
596        if (!$this->lock()) return 'locked';
597
598        $result = $this->deletePageNoLock($page);
599        $this->unlock();
600        return $result;
601    }
602
603    /**
604     * Remove a page from the index without locking the index,
605     * only use this function if the index is already locked
606     *
607     * Erases entries in all known indexes.
608     *
609     * @param string $page  a page name
610     * @return bool         the function completed successfully
611     *
612     * @author Tom N Harris <tnharris@whoopdedo.org>
613     */
614    protected function deletePageNoLock($page)
615    {
616        // load known documents
617        $pid = $this->getPIDNoLock($page);
618        if ($pid === false) {
619            return false;
620        }
621
622        // Remove obsolete index entries
623        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
624        if ($pageword_idx !== '') {
625            $delwords = explode(':', $pageword_idx);
626            $upwords = array();
627            foreach ($delwords as $word) {
628                if ($word != '') {
629                    list($wlen,$wid) = explode('*', $word);
630                    $wid = (int)$wid;
631                    $upwords[$wlen][] = $wid;
632                }
633            }
634            foreach ($upwords as $wlen => $widx) {
635                $index = $this->getIndex('i', $wlen);
636                foreach ($widx as $wid) {
637                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
638                }
639                $this->saveIndex('i', $wlen, $index);
640            }
641        }
642        // Save the reverse index
643        if (!$this->saveIndexKey('pageword', '', $pid, '')) {
644            return false;
645        }
646
647        $this->saveIndexKey('title', '', $pid, '');
648        $keyidx = $this->getIndex('metadata', '');
649        foreach ($keyidx as $metaname) {
650            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
651            $meta_idx = $this->getIndex($metaname.'_i', '');
652            foreach ($val_idx as $id) {
653                if ($id === '') continue;
654                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
655            }
656            $this->saveIndex($metaname.'_i', '', $meta_idx);
657            $this->saveIndexKey($metaname.'_p', '', $pid, '');
658        }
659
660        return true;
661    }
662
663    /**
664     * Clear the whole index
665     *
666     * @return bool  If the index has been cleared successfully
667     */
668    public function clear()
669    {
670        global $conf;
671
672        if (!$this->lock()) return false;
673
674        @unlink($conf['indexdir'].'/page.idx');
675        @unlink($conf['indexdir'].'/title.idx');
676        @unlink($conf['indexdir'].'/pageword.idx');
677        @unlink($conf['indexdir'].'/metadata.idx');
678        $dir = @opendir($conf['indexdir']);
679        if ($dir !== false) {
680            while (($f = readdir($dir)) !== false) {
681                if (in_array($f[0], ['i', 'w']) && substr($f, -4) == '.idx') {
682                    // fulltext index
683                    @unlink($conf['indexdir']."/$f");
684                } elseif (in_array(substr($f, -6), ['_w.idx','_i.idx','_p.idx'])) {
685                    // metadata index
686                    @unlink($conf['indexdir']."/$f");
687                }
688            }
689        }
690        @unlink($conf['indexdir'].'/lengths.idx');
691
692        // clear the pid cache
693        $this->pidCache = array();
694
695        $this->unlock();
696        return true;
697    }
698
699    /**
700     * Split the text into words for fulltext search
701     *
702     * @triggers INDEXER_TEXT_PREPARE
703     * This event allows plugins to modify the text before it gets tokenized.
704     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
705     *
706     * @param string $text  plain text
707     * @param bool $wc      are wildcards allowed?
708     * @return array        list of words in the text
709     *
710     * @author Tom N Harris <tnharris@whoopdedo.org>
711     * @author Andreas Gohr <andi@splitbrain.org>
712     */
713    public function tokenizer($text, $wc=false)
714    {
715        $wc = ($wc) ? '' : '\*';
716
717        // prepare the text to be tokenized
718        $evt = new Event('INDEXER_TEXT_PREPARE', $text);
719        if ($evt->advise_before(true)) {
720            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
721                $text = Utf8\Asian::separateAsianWords($text);
722            }
723        }
724        $evt->advise_after();
725        unset($evt);
726
727        $text = strtr($text,
728                       array(
729                           "\r" => ' ',
730                           "\n" => ' ',
731                           "\t" => ' ',
732                           "\xC2\xAD" => '', //soft-hyphen
733                       )
734                     );
735        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
736            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
737        }
738
739        $wordlist = explode(' ', $text);
740        foreach ($wordlist as $i => $word) {
741            $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
742                Utf8\PhpString::strtolower($word) : strtolower($word);
743        }
744
745        foreach ($wordlist as $i => $word) {
746            if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
747              || array_search($word, $this->getStopwords(), true) !== false) {
748                unset($wordlist[$i]);
749            }
750        }
751        return array_values($wordlist);
752    }
753
754    /**
755     * Get the numeric PID of a page
756     *
757     * @param string $page  The page to get the PID for
758     * @return int|bool     The page id on success, false on error
759     */
760    public function getPID($page)
761    {
762        // return PID without locking when it is in the cache
763        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
764
765        if (!$this->lock()) return false;
766
767        // load known documents
768        $pid = $this->getPIDNoLock($page);
769        if ($pid === false) {
770            $this->unlock();
771            return false;
772        }
773
774        $this->unlock();
775        return $pid;
776    }
777
778    /**
779     * Get the numeric PID of a page without locking the index.
780     * Only use this function when the index is already locked.
781     *
782     * @param string $page  The page to get the PID for
783     * @return int|bool     The page id on success, false on error
784     */
785    protected function getPIDNoLock($page)
786    {
787        // avoid expensive addIndexKey operation for the most recently
788        // requested pages by using a cache
789        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
790        $pid = $this->addIndexKey('page', '', $page);
791        // limit cache to 10 entries by discarding the oldest element
792        // as in DokuWiki usually only the most recently
793        // added item will be requested again
794        if (count($this->pidCache) > 10) array_shift($this->pidCache);
795        $this->pidCache[$page] = $pid;
796        return $pid;
797    }
798
799    /**
800     * Get the page id of a numeric PID
801     *
802     * @param int $pid The PID to get the page id for
803     * @return string The page id
804     */
805    public function getPageFromPID($pid)
806    {
807        return $this->getIndexKey('page', '', $pid);
808    }
809
810    /**
811     * Find pages in the fulltext index containing the words,
812     *
813     * The search words must be pre-tokenized, meaning only letters and
814     * numbers with an optional wildcard
815     *
816     * The returned array will have the original tokens as key. The values
817     * in the returned list is an array with the page names as keys and the
818     * number of times that token appears on the page as value.
819     *
820     * @param array  $tokens list of words to search for
821     * @return array         list of page names with usage counts
822     *
823     * @author Tom N Harris <tnharris@whoopdedo.org>
824     * @author Andreas Gohr <andi@splitbrain.org>
825     */
826    public function lookup(&$tokens)
827    {
828        $result = array();
829        $wids = $this->getIndexWords($tokens, $result);
830        if (empty($wids)) return array();
831        // load known words and documents
832        $page_idx = $this->getIndex('page', '');
833        $docs = array();
834        foreach (array_keys($wids) as $wlen) {
835            $wids[$wlen] = array_unique($wids[$wlen]);
836            $index = $this->getIndex('i', $wlen);
837            foreach ($wids[$wlen] as $ixid) {
838                if ($ixid < count($index)) {
839                    $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
840                }
841            }
842        }
843        // merge found pages into final result array
844        $final = array();
845        foreach ($result as $word => $res) {
846            $final[$word] = array();
847            foreach ($res as $wid) {
848                // handle the case when ($ixid < count($index)) has been false
849                // and thus $docs[$wid] hasn't been set.
850                if (!isset($docs[$wid])) continue;
851                $hits = &$docs[$wid];
852                foreach ($hits as $hitkey => $hitcnt) {
853                    // make sure the document still exists
854                    if (!page_exists($hitkey, '', false)) continue;
855                    if (!isset($final[$word][$hitkey])) {
856                        $final[$word][$hitkey] = $hitcnt;
857                    } else {
858                        $final[$word][$hitkey] += $hitcnt;
859                    }
860                }
861            }
862        }
863        return $final;
864    }
865
866    /**
867     * Find pages containing a metadata key.
868     *
869     * The metadata values are compared as case-sensitive strings. Pass a
870     * callback function that returns true or false to use a different
871     * comparison function. The function will be called with the $value being
872     * searched for as the first argument, and the word in the index as the
873     * second argument. The function preg_match can be used directly if the
874     * values are regexes.
875     *
876     * @param string    $key    name of the metadata key to look for
877     * @param string    $value  search term to look for, must be a string or array of strings
878     * @param callback  $func   comparison function
879     * @return array            lists with page names, keys are query values if $value is array
880     *
881     * @author Tom N Harris <tnharris@whoopdedo.org>
882     * @author Michael Hamann <michael@content-space.de>
883     */
884    public function lookupKey($key, &$value, $func=null)
885    {
886        if (!is_array($value)) {
887            $value_array = array($value);
888        } else {
889            $value_array =& $value;
890        }
891
892        // the matching ids for the provided value(s)
893        $value_ids = array();
894
895        $metaname = $this->cleanName($key);
896
897        // get all words in order to search the matching ids
898        if ($key == 'title') {
899            $words = $this->getIndex('title', '');
900        } else {
901            $words = $this->getIndex($metaname.'_w', '');
902        }
903
904        if (!is_null($func)) {
905            foreach ($value_array as $val) {
906                foreach ($words as $i => $word) {
907                    if (call_user_func_array($func, array($val, $word))) {
908                        $value_ids[$i][] = $val;
909                    }
910                }
911            }
912        } else {
913            foreach ($value_array as $val) {
914                $xval = $val;
915                $caret = '^';
916                $dollar = '$';
917                // check for wildcards
918                if (substr($xval, 0, 1) == '*') {
919                    $xval = substr($xval, 1);
920                    $caret = '';
921                }
922                if (substr($xval, -1, 1) == '*') {
923                    $xval = substr($xval, 0, -1);
924                    $dollar = '';
925                }
926                if (!$caret || !$dollar) {
927                    $re = $caret.preg_quote($xval, '/').$dollar;
928                    foreach (array_keys(preg_grep('/'.$re.'/', $words)) as $i) {
929                        $value_ids[$i][] = $val;
930                    }
931                } else {
932                    if (($i = array_search($val, $words, true)) !== false) {
933                        $value_ids[$i][] = $val;
934                    }
935                }
936            }
937        }
938
939        unset($words); // free the used memory
940
941        // initialize the result so it won't be null
942        $result = array();
943        foreach ($value_array as $val) {
944            $result[$val] = array();
945        }
946
947        $page_idx = $this->getIndex('page', '');
948
949        // Special handling for titles
950        if ($key == 'title') {
951            foreach ($value_ids as $pid => $val_list) {
952                $page = $page_idx[$pid];
953                foreach ($val_list as $val) {
954                    $result[$val][] = $page;
955                }
956            }
957        } else {
958            // load all lines and pages so the used lines can be taken
959            // and matched with the pages
960            $lines = $this->getIndex($metaname.'_i', '');
961
962            foreach ($value_ids as $value_id => $val_list) {
963                // parse the tuples of the form page_id*1:page2_id*1 and so on,
964                // return value is an array with page_id => 1, page2_id => 1 etc.
965                // so take the keys only
966                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
967                foreach ($val_list as $val) {
968                    $result[$val] = array_merge($result[$val], $pages);
969                }
970            }
971        }
972        if (!is_array($value)) $result = $result[$value];
973        return $result;
974    }
975
976    /**
977     * Find the index ID of each search term.
978     *
979     * The query terms should only contain valid characters, with a '*' at
980     * either the beginning or end of the word (or both).
981     * The $result parameter can be used to merge the index locations with
982     * the appropriate query term.
983     *
984     * @param array  $words  The query terms.
985     * @param array  $result Set to word => array("length*id" ...)
986     * @return array         Set to length => array(id ...)
987     *
988     * @author Tom N Harris <tnharris@whoopdedo.org>
989     */
990    protected function getIndexWords(&$words, &$result)
991    {
992        $tokens = array();
993        $tokenlength = array();
994        $tokenwild = array();
995        foreach ($words as $word) {
996            $result[$word] = array();
997            $caret = '^';
998            $dollar = '$';
999            $xword = $word;
1000            $wlen = static::wordlen($word);
1001
1002            // check for wildcards
1003            if (substr($xword, 0, 1) == '*') {
1004                $xword = substr($xword, 1);
1005                $caret = '';
1006                $wlen -= 1;
1007            }
1008            if (substr($xword, -1, 1) == '*') {
1009                $xword = substr($xword, 0, -1);
1010                $dollar = '';
1011                $wlen -= 1;
1012            }
1013            if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword)) {
1014                continue;
1015            }
1016            if (!isset($tokens[$xword])) {
1017                $tokenlength[$wlen][] = $xword;
1018            }
1019            if (!$caret || !$dollar) {
1020                $re = $caret.preg_quote($xword, '/').$dollar;
1021                $tokens[$xword][] = array($word, '/'.$re.'/');
1022                if (!isset($tokenwild[$xword])) {
1023                    $tokenwild[$xword] = $wlen;
1024                }
1025            } else {
1026                $tokens[$xword][] = array($word, null);
1027            }
1028        }
1029        asort($tokenwild);
1030        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
1031        // $tokenlength = array( base word length => base word ... )
1032        // $tokenwild = array( base word => base word length ... )
1033        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
1034        $indexes_known = $this->indexLengths($length_filter);
1035        if (!empty($tokenwild)) sort($indexes_known);
1036        // get word IDs
1037        $wids = array();
1038        foreach ($indexes_known as $ixlen) {
1039            $word_idx = $this->getIndex('w', $ixlen);
1040            // handle exact search
1041            if (isset($tokenlength[$ixlen])) {
1042                foreach ($tokenlength[$ixlen] as $xword) {
1043                    $wid = array_search($xword, $word_idx, true);
1044                    if ($wid !== false) {
1045                        $wids[$ixlen][] = $wid;
1046                        foreach ($tokens[$xword] as $w)
1047                            $result[$w[0]][] = "$ixlen*$wid";
1048                    }
1049                }
1050            }
1051            // handle wildcard search
1052            foreach ($tokenwild as $xword => $wlen) {
1053                if ($wlen >= $ixlen) break;
1054                foreach ($tokens[$xword] as $w) {
1055                    if (is_null($w[1])) continue;
1056                    foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
1057                        $wids[$ixlen][] = $wid;
1058                        $result[$w[0]][] = "$ixlen*$wid";
1059                    }
1060                }
1061            }
1062        }
1063        return $wids;
1064    }
1065
1066    /**
1067     * Return a list of all pages
1068     * Warning: pages may not exist!
1069     *
1070     * @param string    $key    list only pages containing the metadata key (optional)
1071     * @return array            list of page names
1072     *
1073     * @author Tom N Harris <tnharris@whoopdedo.org>
1074     */
1075    public function getPages($key=null)
1076    {
1077        $page_idx = $this->getIndex('page', '');
1078        if (is_null($key)) return $page_idx;
1079
1080        $metaname = $this->cleanName($key);
1081
1082        // Special handling for titles
1083        if ($key == 'title') {
1084            $title_idx = $this->getIndex('title', '');
1085            array_splice($page_idx, count($title_idx));
1086            foreach ($title_idx as $i => $title) {
1087                if ($title === '') unset($page_idx[$i]);
1088            }
1089            return array_values($page_idx);
1090        }
1091
1092        $pages = array();
1093        $lines = $this->getIndex($metaname.'_i', '');
1094        foreach ($lines as $line) {
1095            $pages = array_merge($pages, $this->parseTuples($page_idx, $line));
1096        }
1097        return array_keys($pages);
1098    }
1099
1100    /**
1101     * Return a list of words sorted by number of times used
1102     *
1103     * @param int       $min    bottom frequency threshold
1104     * @param int       $max    upper frequency limit. No limit if $max<$min
1105     * @param int       $minlen minimum length of words to count
1106     * @param string    $key    metadata key to list. Uses the fulltext index if not given
1107     * @return array            list of words as the keys and frequency as values
1108     *
1109     * @author Tom N Harris <tnharris@whoopdedo.org>
1110     */
1111    public function histogram($min=1, $max=0, $minlen=3, $key=null)
1112    {
1113        if ($min < 1)    $min = 1;
1114        if ($max < $min) $max = 0;
1115
1116        $result = array();
1117
1118        if ($key == 'title') {
1119            $index = $this->getIndex('title', '');
1120            $index = array_count_values($index);
1121            foreach ($index as $val => $cnt) {
1122                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) {
1123                    $result[$val] = $cnt;
1124                }
1125            }
1126        } elseif (!is_null($key)) {
1127            $metaname = $this->cleanName($key);
1128            $index = $this->getIndex($metaname.'_i', '');
1129            $val_idx = array();
1130            foreach ($index as $wid => $line) {
1131                $freq = $this->countTuples($line);
1132                if ($freq >= $min && (!$max || $freq <= $max)) {
1133                    $val_idx[$wid] = $freq;
1134                }
1135            }
1136            if (!empty($val_idx)) {
1137                $words = $this->getIndex($metaname.'_w', '');
1138                foreach ($val_idx as $wid => $freq) {
1139                    if (strlen($words[$wid]) >= $minlen) {
1140                        $result[$words[$wid]] = $freq;
1141                    }
1142                }
1143            }
1144        } else {
1145            $lengths = $this->listIndexLengths();
1146            foreach ($lengths as $length) {
1147                if ($length < $minlen) continue;
1148                $index = $this->getIndex('i', $length);
1149                $words = null;
1150                foreach ($index as $wid => $line) {
1151                    $freq = $this->countTuples($line);
1152                    if ($freq >= $min && (!$max || $freq <= $max)) {
1153                        if ($words === null) {
1154                            $words = $this->getIndex('w', $length);
1155                        }
1156                        $result[$words[$wid]] = $freq;
1157                    }
1158                }
1159            }
1160        }
1161
1162        arsort($result);
1163        return $result;
1164    }
1165
1166    /**
1167     * Clean a name of a key for use as a file name.
1168     *
1169     * Romanizes non-latin characters, then strips away anything that's
1170     * not a letter, number, or underscore.
1171     *
1172     * @author Tom N Harris <tnharris@whoopdedo.org>
1173     *
1174     * @param string $name
1175     * @return string
1176     */
1177    protected function cleanName($name)
1178    {
1179        $name = Utf8\Clean::romanize(trim((string)$name));
1180        $name = preg_replace('#[ \./\\:-]+#', '_', $name);
1181        $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
1182        return strtolower($name);
1183    }
1184
1185    /**
1186     * Lock the indexer.
1187     *
1188     * @author Tom N Harris <tnharris@whoopdedo.org>
1189     *
1190     * @return bool|string
1191     */
1192    protected function lock()
1193    {
1194        global $conf;
1195        $status = true;
1196        $run = 0;
1197        $lock = $conf['lockdir'].'/_indexer.lock';
1198        while (!@mkdir($lock, $conf['dmode'])) {
1199            usleep(50);
1200            if (is_dir($lock) && time() - @filemtime($lock) > 60*5) {
1201                // looks like a stale lock - remove it
1202                if (!@rmdir($lock)) {
1203                    $status = "removing the stale lock failed";
1204                    return false;
1205                } else {
1206                    $status = "stale lock removed";
1207                }
1208            } elseif ($run++ == 1000) {
1209                // we waited 5 seconds for that lock
1210                return false;
1211            }
1212        }
1213        if (!empty($conf['dperm'])) {
1214            chmod($lock, $conf['dperm']);
1215        }
1216        return $status;
1217    }
1218
1219    /**
1220     * Release the indexer lock.
1221     *
1222     * @author Tom N Harris <tnharris@whoopdedo.org>
1223     *
1224     * @return bool
1225     */
1226    protected function unlock()
1227    {
1228        global $conf;
1229        @rmdir($conf['lockdir'].'/_indexer.lock');
1230        return true;
1231    }
1232
1233    /**
1234     * Retrieve the entire index.
1235     *
1236     * The $suffix argument is for an index that is split into multiple parts.
1237     * Different index files should use different base names.
1238     *
1239     * @param string    $idx    name of the index
1240     * @param string    $suffix subpart identifier
1241     * @return array            list of lines without CR or LF
1242     *
1243     * @author Tom N Harris <tnharris@whoopdedo.org>
1244     */
1245    public function getIndex($idx, $suffix)
1246    {
1247        global $conf;
1248        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
1249        if (!file_exists($fn)) return array();
1250        return file($fn, FILE_IGNORE_NEW_LINES);
1251    }
1252
1253    /**
1254     * Replace the contents of the index with an array.
1255     *
1256     * @param string    $idx    name of the index
1257     * @param string    $suffix subpart identifier
1258     * @param array     $lines  list of lines without LF
1259     * @return bool             If saving succeeded
1260     *
1261     * @author Tom N Harris <tnharris@whoopdedo.org>
1262     */
1263    protected function saveIndex($idx, $suffix, &$lines)
1264    {
1265        global $conf;
1266        $fn = $conf['indexdir'].'/'.$idx.$suffix;
1267        $fh = @fopen($fn.'.tmp', 'w');
1268        if (!$fh) return false;
1269        fwrite($fh, join("\n", $lines));
1270        if (!empty($lines)) {
1271            fwrite($fh, "\n");
1272        }
1273        fclose($fh);
1274        if (isset($conf['fperm'])) {
1275            chmod($fn.'.tmp', $conf['fperm']);
1276        }
1277        io_rename($fn.'.tmp', $fn.'.idx');
1278        return true;
1279    }
1280
1281    /**
1282     * Retrieve a line from the index.
1283     *
1284     * @param string    $idx    name of the index
1285     * @param string    $suffix subpart identifier
1286     * @param int       $id     the line number
1287     * @return string           a line with trailing whitespace removed
1288     *
1289     * @author Tom N Harris <tnharris@whoopdedo.org>
1290     */
1291    protected function getIndexKey($idx, $suffix, $id)
1292    {
1293        global $conf;
1294        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
1295        if (!file_exists($fn)) return '';
1296        $fh = @fopen($fn, 'r');
1297        if (!$fh) return '';
1298        $ln = -1;
1299        while (($line = fgets($fh)) !== false) {
1300            if (++$ln == $id) break;
1301        }
1302        fclose($fh);
1303        return rtrim((string)$line);
1304    }
1305
1306    /**
1307     * Write a line into the index.
1308     *
1309     * @param string    $idx    name of the index
1310     * @param string    $suffix subpart identifier
1311     * @param int       $id     the line number
1312     * @param string    $line   line to write
1313     * @return bool             If saving succeeded
1314     *
1315     * @author Tom N Harris <tnharris@whoopdedo.org>
1316     */
1317    protected function saveIndexKey($idx, $suffix, $id, $line)
1318    {
1319        global $conf;
1320        if (substr($line, -1) !== "\n") {
1321            $line .= "\n";
1322        }
1323        $fn = $conf['indexdir'].'/'.$idx.$suffix;
1324        $fh = @fopen($fn.'.tmp', 'w');
1325        if (!$fh) return false;
1326        $ih = @fopen($fn.'.idx', 'r');
1327        if ($ih) {
1328            $ln = -1;
1329            while (($curline = fgets($ih)) !== false) {
1330                fwrite($fh, (++$ln == $id) ? $line : $curline);
1331            }
1332            if ($id > $ln) {
1333                while ($id > ++$ln) {
1334                    fwrite($fh, "\n");
1335                }
1336                fwrite($fh, $line);
1337            }
1338            fclose($ih);
1339        } else {
1340            $ln = -1;
1341            while ($id > ++$ln) {
1342                fwrite($fh, "\n");
1343            }
1344            fwrite($fh, $line);
1345        }
1346        fclose($fh);
1347        if (isset($conf['fperm'])) {
1348            chmod($fn.'.tmp', $conf['fperm']);
1349        }
1350        io_rename($fn.'.tmp', $fn.'.idx');
1351        return true;
1352    }
1353
1354    /**
1355     * Retrieve or insert a value in the index.
1356     *
1357     * @param string    $idx    name of the index
1358     * @param string    $suffix subpart identifier
1359     * @param string    $value  line to find in the index
1360     * @return int|bool          line number of the value in the index
1361     *                           or false if writing the index failed
1362     *
1363     * @author Tom N Harris <tnharris@whoopdedo.org>
1364     */
1365    protected function addIndexKey($idx, $suffix, $value)
1366    {
1367        $index = $this->getIndex($idx, $suffix);
1368        $id = array_search($value, $index, true);
1369        if ($id === false) {
1370            $id = count($index);
1371            $index[$id] = $value;
1372            if (!$this->saveIndex($idx, $suffix, $index)) {
1373                trigger_error("Failed to write $idx index", E_USER_ERROR);
1374                return false;
1375            }
1376        }
1377        return $id;
1378    }
1379
1380    /**
1381     * Get the list of lengths indexed in the wiki.
1382     *
1383     * Read the index directory or a cache file and returns
1384     * a sorted array of lengths of the words used in the wiki.
1385     *
1386     * @author YoBoY <yoboy.leguesh@gmail.com>
1387     *
1388     * @return array
1389     */
1390    public function listIndexLengths()
1391    {
1392        global $conf;
1393        $lengthsFile = $conf['indexdir'].'/lengths.idx';
1394
1395        // testing what we have to do, create a cache file or not.
1396        if ($conf['readdircache'] == 0) {
1397            $docache = false;
1398        } else {
1399            clearstatcache();
1400            if (file_exists($lengthsFile)
1401                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
1402            ) {
1403                if (
1404                    ($lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
1405                    !== false
1406                ) {
1407                    $idx = array();
1408                    foreach ($lengths as $length) {
1409                        $idx[] = (int)$length;
1410                    }
1411                    return $idx;
1412                }
1413            }
1414            $docache = true;
1415        }
1416
1417        if ($conf['readdircache'] == 0 || $docache) {
1418            $dir = @opendir($conf['indexdir']);
1419            if ($dir === false) return array();
1420            $idx = array();
1421            while (($f = readdir($dir)) !== false) {
1422                if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
1423                    $i = substr($f, 1, -4);
1424                    if (is_numeric($i)) $idx[] = (int)$i;
1425                }
1426            }
1427            closedir($dir);
1428            sort($idx);
1429            // save this in a file
1430            if ($docache) {
1431                $handle = @fopen($lengthsFile, 'w');
1432                @fwrite($handle, implode("\n", $idx));
1433                @fclose($handle);
1434            }
1435            return $idx;
1436        }
1437        return array();
1438    }
1439
1440    /**
1441     * Get the word lengths that have been indexed.
1442     *
1443     * Reads the index directory and returns an array of lengths
1444     * that there are indices for.
1445     *
1446     * @author YoBoY <yoboy.leguesh@gmail.com>
1447     *
1448     * @param array|int $filter
1449     * @return array
1450     */
1451    protected function indexLengths($filter)
1452    {
1453        global $conf;
1454        $idx = array();
1455        if (is_array($filter)) {
1456            // testing if index files exist only
1457            $path = $conf['indexdir']."/i";
1458            foreach ($filter as $key => $value) {
1459                if (file_exists($path.$key.'.idx')) {
1460                    $idx[] = $key;
1461                }
1462            }
1463        } else {
1464            $lengths = $this->listIndexLengths();
1465            foreach ($lengths as $key => $length) {
1466                // keep all the values equal or superior
1467                if ((int)$length >= (int)$filter) {
1468                    $idx[] = $length;
1469                }
1470            }
1471        }
1472        return $idx;
1473    }
1474
1475    /**
1476     * Insert or replace a tuple in a line.
1477     *
1478     * @author Tom N Harris <tnharris@whoopdedo.org>
1479     *
1480     * @param string $line
1481     * @param int $id
1482     * @param int $count
1483     * @return string
1484     */
1485    protected function updateTuple($line, $id, $count)
1486    {
1487        if ($line != '') {
1488            $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
1489        }
1490        $line = trim($line, ':');
1491        if ($count) {
1492            if ($line) {
1493                return "$id*$count:".$line;
1494            } else {
1495                return "$id*$count";
1496            }
1497        }
1498        return $line;
1499    }
1500
1501    /**
1502     * Split a line into an array of tuples.
1503     *
1504     * @author Tom N Harris <tnharris@whoopdedo.org>
1505     * @author Andreas Gohr <andi@splitbrain.org>
1506     *
1507     * @param array $keys
1508     * @param string $line
1509     * @return array
1510     */
1511    protected function parseTuples(&$keys, $line)
1512    {
1513        $result = array();
1514        if ($line == '') return $result;
1515        $parts = explode(':', $line);
1516        foreach ($parts as $tuple) {
1517            if ($tuple === '') continue;
1518            list($key, $cnt) = explode('*', $tuple);
1519            if (!$cnt) continue;
1520            $key = $keys[$key];
1521            if ($key === false || is_null($key)) continue;
1522            $result[$key] = $cnt;
1523        }
1524        return $result;
1525    }
1526
1527    /**
1528     * Sum the counts in a list of tuples.
1529     *
1530     * @author Tom N Harris <tnharris@whoopdedo.org>
1531     *
1532     * @param string $line
1533     * @return int
1534     */
1535    protected function countTuples($line)
1536    {
1537        $freq = 0;
1538        $parts = explode(':', $line);
1539        foreach ($parts as $tuple) {
1540            if ($tuple === '') continue;
1541            list(/* $pid */, $cnt) = explode('*', $tuple);
1542            $freq += (int)$cnt;
1543        }
1544        return $freq;
1545    }
1546}
1547