xref: /dokuwiki/inc/indexer.php (revision e1e1a7e012189660a2cfd7631e82234b5ae92f69)
1<?php
2/**
3 * Functions to create the fulltext search index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 * @author     Tom N Harris <tnharris@whoopdedo.org>
8 */
9
// Abort unless DOKU_INC has been defined by the including script
if(!defined('DOKU_INC')) die('meh.');

// Version tag used to force rebuild on upgrade
// (idx_get_version() builds the effective version string from it)
define('INDEXER_VERSION', 3);

// Minimum token length to use in the index. Numeric tokens are exempt from
// this limit. Can be overridden by defining the constant before this file loads.
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regular expressions
// define the Unicode ranges for Asian characters.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// The original author is no language expert: if you think some ranges are
// wrongly chosen or a range is missing, please contact the maintainers.
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');
// Two character classes in sequence: one character from the first class,
// optionally followed by one character from the second class.
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
// Non-capturing alternation of the three patterns above; the tokenizer wraps
// it in a capturing group to surround each match with spaces.
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
48
49/**
50 * Version of the indexer taking into consideration the external tokenizer.
51 * The indexer is only compatible with data written by the same version.
52 *
53 * @author Tom N Harris <tnharris@whoopdedo.org>
54 */
function idx_get_version(){
    global $conf;

    // Without an external tokenizer the bare version constant is the answer
    if(!$conf['external_tokenizer']) return INDEXER_VERSION;

    // Otherwise the tokenizer command becomes part of the version, so an
    // index built with a different command is treated as incompatible
    $command = trim($conf['tokenizer_cmd']);
    return INDEXER_VERSION . '+' . $command;
}
62
63/**
64 * Measure the length of a string.
65 * Differs from strlen in handling of asian characters.
66 *
67 * @author Tom N Harris <tnharris@whoopdedo.org>
68 */
function wordlen($w){
    // Start from the byte length of the word
    $length = strlen($w);

    // Every byte in the range E2..EF adds (byte value - 0xE1) to the length.
    // This fakes a longer "length" for such words so that they are spread
    // over several index files instead of all landing in the same one.
    if(preg_match_all('/[\xE2-\xEF]/',$w,$found) > 0) {
        $lead = $found[0];
        $length += array_sum(array_map('ord', $lead)) - 0xE1 * count($lead);
    }
    return $length;
}
79
80/**
81 * Class that encapsulates operations on the indexer database.
82 *
83 * @author Tom N Harris <tnharris@whoopdedo.org>
84 */
85class Doku_Indexer {
86
87    /**
88     * Adds the contents of a page to the fulltext index
89     *
90     * The added text replaces previous words for the same page.
91     * An empty value erases the page.
92     *
93     * @param string    $page   a page name
94     * @param string    $text   the body of the page
95     * @return boolean          the function completed successfully
96     * @author Tom N Harris <tnharris@whoopdedo.org>
97     * @author Andreas Gohr <andi@splitbrain.org>
98     */
99    public function addPageWords($page, $text) {
100        if (!$this->_lock())
101            return "locked";
102
103        // load known documents
104        $page_idx = $this->_addIndexKey('page', '', $page);
105        if ($page_idx === false) {
106            $this->_unlock();
107            return false;
108        }
109
110        $pagewords = array();
111        // get word usage in page
112        $words = $this->_getPageWords($text);
113        if ($words === false) {
114            $this->_unlock();
115            return false;
116        }
117
118        if (!empty($words)) {
119            foreach (array_keys($words) as $wlen) {
120                $index = $this->_getIndex('i', $wlen);
121                foreach ($words[$wlen] as $wid => $freq) {
122                    $idx = ($wid<count($index)) ? $index[$wid] : '';
123                    $index[$wid] = $this->_updateTuple($idx, $pid, $freq);
124                    $pagewords[] = "$wlen*$wid";
125                }
126                if (!$this->_saveIndex('i', $wlen, $index)) {
127                    $this->_unlock();
128                    return false;
129                }
130            }
131        }
132
133        // Remove obsolete index entries
134        $pageword_idx = $this->_getIndexKey('pageword', '', $pid);
135        if ($pageword_idx !== '') {
136            $oldwords = explode(':',$pageword_idx);
137            $delwords = array_diff($oldwords, $pagewords);
138            $upwords = array();
139            foreach ($delwords as $word) {
140                if ($word != '') {
141                    list($wlen,$wid) = explode('*', $word);
142                    $wid = (int)$wid;
143                    $upwords[$wlen][] = $wid;
144                }
145            }
146            foreach ($upwords as $wlen => $widx) {
147                $index = $this->_getIndex('i', $wlen);
148                foreach ($widx as $wid) {
149                    $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0);
150                }
151                $this->_saveIndex('i', $wlen, $index);
152            }
153        }
154        // Save the reverse index
155        $pageword_idx = join(':', $pagewords);
156        if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) {
157            $this->_unlock();
158            return false;
159        }
160
161        $this->_unlock();
162        return true;
163    }
164
165    /**
166     * Split the words in a page and add them to the index.
167     *
168     * @author Andreas Gohr <andi@splitbrain.org>
169     * @author Christopher Smith <chris@jalakai.co.uk>
170     * @author Tom N Harris <tnharris@whoopdedo.org>
171     */
172    private function _getPageWords($text) {
173        global $conf;
174
175        $tokens = $this->tokenizer($text);
176        $tokens = array_count_values($tokens);  // count the frequency of each token
177
178        $words = array();
179        foreach ($tokens as $w=>$c) {
180            $l = wordlen($w);
181            if (isset($words[$l])){
182                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
183            }else{
184                $words[$l] = array($w => $c);
185            }
186        }
187
188        // arrive here with $words = array(wordlen => array(word => frequency))
189        $word_idx_modified = false;
190        $index = array();   //resulting index
191        foreach (array_keys($words) as $wlen) {
192            $word_idx = $this->_getIndex('w', $wlen);
193            foreach ($words[$wlen] as $word => $freq) {
194                $wid = array_search($word, $word_idx);
195                if ($wid === false) {
196                    $wid = count($word_idx);
197                    $word_idx[] = $word;
198                    $word_idx_modified = true;
199                }
200                if (!isset($index[$wlen]))
201                    $index[$wlen] = array();
202                $index[$wlen][$wid] = $freq;
203            }
204            // save back the word index
205            if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx))
206                return false;
207        }
208
209        return $index;
210    }
211
212    /**
213     * Add/update keys to/of the metadata index.
214     *
215     * Adding new keys does not remove other keys for the page.
216     * An empty value will erase the key.
217     * The $key parameter can be an array to add multiple keys. $value will
218     * not be used if $key is an array.
219     *
220     * @param string    $page   a page name
221     * @param mixed     $key    a key string or array of key=>value pairs
222     * @param mixed     $value  the value or list of values
223     * @return boolean          the function completed successfully
224     * @author Tom N Harris <tnharris@whoopdedo.org>
225     * @author Michael Hamann <michael@content-space.de>
226     */
227    public function addMetaKeys($page, $key, $value=null) {
228        if (!is_array($key)) {
229            $key = array($key => $value);
230        } elseif (!is_null($value)) {
231            // $key is array, but $value is not null
232            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
233        }
234
235        if (!$this->_lock())
236            return "locked";
237
238        // load known documents
239        $pid = $this->_addIndexKey('page', '', $page);
240        if ($pid === false) {
241            $this->_unlock();
242            return false;
243        }
244
245        foreach ($key as $name => $values) {
246            $metaname = idx_cleanName($name);
247            $metaidx = $this->_getIndex($metaname, '_i');
248            $metawords = $this->_getIndex($metaname, '_w');
249            $addwords = false;
250
251            if (!is_array($values)) $values = array($values);
252
253            $val_idx = $this->_getIndexKey($metaname, '_p', $pid);
254            if ($val_idx != '') {
255                $val_idx = explode(':', $val_idx);
256                // -1 means remove, 0 keep, 1 add
257                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
258            } else {
259                $val_idx = array();
260            }
261
262
263            foreach ($values as $val) {
264                $val = (string)$val;
265                if ($val !== "") {
266                    $id = array_search($val, $metawords);
267                    if ($id === false) {
268                        $id = count($metawords);
269                        $metawords[$id] = $val;
270                        $addwords = true;
271                    }
272                    // test if value is already in the index
273                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0)
274                        $val_idx[$id] = 0;
275                    else // else add it
276                        $val_idx[$id] = 1;
277                }
278            }
279
280            if ($addwords)
281                $this->_saveIndex($metaname.'_w', '', $metawords);
282            $vals_changed = false;
283            foreach ($val_idx as $id => $action) {
284                if ($action == -1) {
285                    $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 0);
286                    $vals_changed = true;
287                    unset($val_idx[$id]);
288                } elseif ($action == 1) {
289                    $metaidx[$id] = $this->_updateTuple($metaidx[$id], $pid, 1);
290                    $vals_changed = true;
291                }
292            }
293
294            if ($vals_changed) {
295                $this->_saveIndex($metaname.'_i', '', $metaidx);
296                $val_idx = implode(':', array_keys($val_idx));
297                $this->_saveIndexKey($metaname.'_p', '', $pid, $val_idx);
298            }
299
300            unset($metaidx);
301            unset($metawords);
302        }
303
304        $this->_unlock();
305        return true;
306    }
307
308    /**
309     * Remove a page from the index
310     *
311     * Erases entries in all known indexes.
312     *
313     * @param string    $page   a page name
314     * @return boolean          the function completed successfully
315     * @author Tom N Harris <tnharris@whoopdedo.org>
316     */
    public function deletePage($page) {
        // NOTE(review): not implemented in this revision. The body is empty, so
        // nothing is removed from any index and the call evaluates to null
        // rather than the boolean promised by the docblock.
    }
319
320    /**
321     * Split the text into words for fulltext search
322     *
323     * TODO: does this also need &$stopwords ?
324     *
325     * @param string    $text   plain text
326     * @param boolean   $wc     are wildcards allowed?
327     * @return array            list of words in the text
328     * @author Tom N Harris <tnharris@whoopdedo.org>
329     * @author Andreas Gohr <andi@splitbrain.org>
330     */
331    public function tokenizer($text, $wc=false) {
332        global $conf;
333        $words = array();
334        $wc = ($wc) ? '' : '\*';
335        $stopwords =& idx_get_stopwords();
336
337        if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') {
338            if (0 == io_exec($conf['tokenizer_cmd'], $text, $output))
339                $text = $output;
340        } else {
341            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
342                // handle asian chars as single words (may fail on older PHP version)
343                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
344                if (!is_null($asia)) $text = $asia; // recover from regexp falure
345            }
346        }
347        $text = strtr($text, "\r\n\t", '   ');
348        if (preg_match('/[^0-9A-Za-z ]/u', $text))
349            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);
350
351        $wordlist = explode(' ', $text);
352        foreach ($wordlist as $word) {
353            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
354                utf8_strtolower($word) : strtolower($word);
355            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
356            if (array_search($word, $stopwords) !== false) continue;
357            $words[] = $word;
358        }
359        return $words;
360    }
361
362    /**
     * Find pages in the fulltext index containing the words.
364     *
365     * The search words must be pre-tokenized, meaning only letters and
366     * numbers with an optional wildcard
367     *
     * The returned array will have the original tokens as keys. Each value
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
371     *
372     * @param arrayref  $tokens list of words to search for
373     * @return array            list of page names with usage counts
374     * @author Tom N Harris <tnharris@whoopdedo.org>
375     * @author Andreas Gohr <andi@splitbrain.org>
376     */
377    public function lookup(&$tokens) {
378        $result = array();
379        $wids = $this->_getIndexWords($tokens, $result);
380        if (empty($wids)) return array();
381        // load known words and documents
382        $page_idx = $this->_getIndex('page', '');
383        $docs = array();
384        foreach (array_keys($wids) as $wlen) {
385            $wids[$wlen] = array_unique($wids[$wlen]);
386            $index = $this->_getIndex('i', $wlen);
387            foreach($wids[$wlen] as $ixid) {
388                if ($ixid < count($index))
389                    $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]);
390            }
391        }
392        // merge found pages into final result array
393        $final = array();
394        foreach ($result as $word => $res) {
395            $final[$word] = array();
396            foreach ($res as $wid) {
397                $hits = &$docs[$wid];
398                foreach ($hits as $hitkey => $hitcnt) {
399                    // make sure the document still exists
400                    if (!page_exists($hitkey, '', false)) continue;
401                    if (!isset($final[$word][$hitkey]))
402                        $final[$word][$hitkey] = $hitcnt;
403                    else
404                        $final[$word][$hitkey] += $hitcnt;
405                }
406            }
407        }
408        return $final;
409    }
410
411    /**
412     * Find pages containing a metadata key.
413     *
414     * The metadata values are compared as case-sensitive strings. Pass a
415     * callback function that returns true or false to use a different
416     * comparison function
417     *
418     * @param string    $key    name of the metadata key to look for
419     * @param string    $value  search term to look for
420     * @param callback  $func   comparison function
421     * @return array            lists with page names, keys are query values
422     * @author Tom N Harris <tnharris@whoopdedo.org>
423     * @author Michael Hamann <michael@content-space.de>
424     */
425    public function lookupKey($key, $value, $func=null) {
426        $metaname = idx_cleanName($key);
427
428        // get all words in order to search the matching ids
429        $words = $this->_getIndex($metaname, '_w');
430
431        // the matching ids for the provided value(s)
432        $value_ids = array();
433
434        if (!is_array($value)) $value = array($value);
435
436        foreach ($value as $val) {
437            if (is_null($func)) {
438                if (($i = array_search($val, $words)) !== false)
439                    $value_ids[$i] = $val;
440            } else {
441                foreach ($words as $i => $word) {
442                    if (call_user_func_array($func, array($word, $value)))
443                        $value_ids[$i] = $val;
444                }
445            }
446        }
447
448        unset($words); // free the used memory
449
450        // load all lines and pages so the used lines can be taken and matched with the pages
451        $lines = $this->_getIndex($metaname, '_i');
452        $page_idx = $this->_getIndex('page', '');
453
454        $result = array();
455        foreach ($value_ids as $value_id => $val) {
456            // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
457            // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
458            $result[$val] = array_keys($this->_parseTuples($page_idx, $lines[$value_id]));
459        }
460        return $result;
461    }
462
463    /**
464     * Find the index ID of each search term.
465     *
466     * The query terms should only contain valid characters, with a '*' at
467     * either the beginning or end of the word (or both).
468     * The $result parameter can be used to merge the index locations with
469     * the appropriate query term.
470     *
471     * @param arrayref  $words  The query terms.
472     * @param arrayref  $result Set to word => array("length*id" ...)
473     * @return array            Set to length => array(id ...)
474     * @author Tom N Harris <tnharris@whoopdedo.org>
475     */
    private function _getIndexWords(&$words, &$result) {
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            // every query term gets an entry in $result, even without matches
            $result[$word] = array();
            $caret = false;   // true when the term starts with '*'
            $dollar = false;  // true when the term ends with '*'
            $xword = $word;   // the term with its wildcards stripped
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = true;
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = true;
                $wlen -= 1;
            }
            // skip terms that are too short, unless they are numeric or use a wildcard
            if ($wlen < IDX_MINWORDLENGTH && !$caret && !$dollar && !is_numeric($xword))
                continue;
            // register each base word only once per length
            if (!isset($tokens[$xword]))
                $tokenlength[$wlen][] = $xword;
            if ($caret || $dollar) {
                // build the regexp from the flags as set above: $caret prepends '^',
                // $dollar appends '$'
                $re = preg_quote($xword, '/');
                if ($caret) $re = '^'.$re;
                if ($dollar) $re = $re.'$';
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword]))
                    $tokenwild[$xword] = $wlen;
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        // order the wildcard words by length so the loop below can stop early
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        // without wildcards only the exact lengths are needed; with wildcards
        // every index from the shortest base word length upwards is needed
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->_indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->_getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // only indexes of words longer than the base word are searched;
                // $tokenwild is sorted by length, so the rest can be skipped too
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    // entries without a regexp are exact terms, handled above
                    if (is_null($w[1])) continue;
                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }
549
550    /**
551     * Return a list of all pages
552     *
553     * @param string    $key    list only pages containing the metadata key (optional)
554     * @return array            list of page names
555     * @author Tom N Harris <tnharris@whoopdedo.org>
556     */
557    public function getPages($key=null) {
558        $page_idx = $this->_getIndex('page', '');
559        if (is_null($key)) return $page_idx;
560    }
561
562    /**
563     * Return a list of words sorted by number of times used
564     *
565     * @param int       $min    bottom frequency threshold
566     * @param int       $max    upper frequency limit. No limit if $max<$min
567     * @param string    $key    metadata key to list. Uses the fulltext index if not given
568     * @return array            list of words as the keys and frequency as values
569     * @author Tom N Harris <tnharris@whoopdedo.org>
570     */
    public function histogram($min=1, $max=0, $key=null) {
        // NOTE(review): not implemented in this revision. The body is empty, so
        // the call evaluates to null rather than the array promised by the docblock.
    }
573
574    /**
575     * Lock the indexer.
576     *
577     * @author Tom N Harris <tnharris@whoopdedo.org>
578     */
579    private function _lock() {
580        global $conf;
581        $status = true;
582        $lock = $conf['lockdir'].'/_indexer.lock';
583        while (!@mkdir($lock, $conf['dmode'])) {
584            usleep(50);
585            if (time() - @filemtime($lock) > 60*5) {
586                // looks like a stale lock, remove it
587                @rmdir($lock);
588                $status = "stale lock removed";
589            } else {
590                return false;
591            }
592        }
593        if ($conf['dperm'])
594            chmod($lock, $conf['dperm']);
595        return $status;
596    }
597
598    /**
599     * Release the indexer lock.
600     *
601     * @author Tom N Harris <tnharris@whoopdedo.org>
602     */
603    private function _unlock() {
604        global $conf;
605        @rmdir($conf['lockdir'].'/_indexer.lock');
606        return true;
607    }
608
609    /**
610     * Retrieve the entire index.
611     *
612     * @author Tom N Harris <tnharris@whoopdedo.org>
613     */
614    private function _getIndex($idx, $suffix) {
615        global $conf;
616        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
617        if (!@file_exists($fn)) return array();
618        return file($fn, FILE_IGNORE_NEW_LINES);
619    }
620
621    /**
622     * Replace the contents of the index with an array.
623     *
624     * @author Tom N Harris <tnharris@whoopdedo.org>
625     */
626    private function _saveIndex($idx, $suffix, &$lines) {
627        global $conf;
628        $fn = $conf['indexdir'].'/'.$idx.$suffix;
629        $fh = @fopen($fn.'.tmp', 'w');
630        if (!$fh) return false;
631        fwrite($fh, join("\n", $lines));
632        fclose($fh);
633        if (isset($conf['fperm']))
634            chmod($fn.'.tmp', $conf['fperm']);
635        io_rename($fn.'.tmp', $fn.'.idx');
636        if ($suffix !== '')
637            $this->_cacheIndexDir($idx, $suffix, empty($lines));
638        return true;
639    }
640
641    /**
642     * Retrieve a line from the index.
643     *
644     * @author Tom N Harris <tnharris@whoopdedo.org>
645     */
646    private function _getIndexKey($idx, $suffix, $id) {
647        global $conf;
648        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
649        if (!@file_exists($fn)) return '';
650        $fh = @fopen($fn, 'r');
651        if (!$fh) return '';
652        $ln = -1;
653        while (($line = fgets($fh)) !== false) {
654            if (++$ln == $id) break;
655        }
656        fclose($fh);
657        return rtrim((string)$line);
658    }
659
660    /**
661     * Write a line into the index.
662     *
663     * @author Tom N Harris <tnharris@whoopdedo.org>
664     */
665    private function _saveIndexKey($idx, $suffix, $id, $line) {
666        global $conf;
667        if (substr($line, -1) != "\n")
668            $line .= "\n";
669        $fn = $conf['indexdir'].'/'.$idx.$suffix;
670        $fh = @fopen($fn.'.tmp', 'w');
671        if (!fh) return false;
672        $ih = @fopen($fn.'.idx', 'r');
673        if ($ih) {
674            $ln = -1;
675            while (($curline = fgets($ih)) !== false) {
676                fwrite($fh, (++$ln == $id) ? $line : $curline);
677            }
678            if ($id > $ln) {
679                while ($id > ++$ln)
680                    fwrite($fh, "\n");
681                fwrite($fh, $line);
682            }
683            fclose($ih);
684        } else {
685            $ln = -1;
686            while ($id > ++$ln)
687                fwrite($fh, "\n");
688            fwrite($fh, $line);
689        }
690        fclose($fh);
691        if (isset($conf['fperm']))
692            chmod($fn.'.tmp', $conf['fperm']);
693        io_rename($fn.'.tmp', $fn.'.idx');
694        if ($suffix !== '')
695            $this->_cacheIndexDir($idx, $suffix);
696        return true;
697    }
698
699    /**
700     * Retrieve or insert a value in the index.
701     *
702     * @author Tom N Harris <tnharris@whoopdedo.org>
703     */
704    private function _addIndexKey($idx, $suffix, $value) {
705        $index = $this->_getIndex($idx, $suffix);
706        $id = array_search($value, $index);
707        if ($id === false) {
708            $id = count($index);
709            $index[$id] = $value;
710            if (!$this->_saveIndex($idx, $suffix, $index)) {
711                trigger_error("Failed to write $idx index", E_USER_ERROR);
712                return false;
713            }
714        }
715        return $id;
716    }
717
718    private function _cacheIndexDir($idx, $suffix, $delete=false) {
719        global $conf;
720        if ($idx == 'i')
721            $cachename = $conf['indexdir'].'/lengths';
722        else
723            $cachename = $conf['indexdir'].'/'.$idx.'lengths';
724        $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
725        if ($lengths === false) $lengths = array();
726        $old = array_search((string)$suffix, $lengths);
727        if (empty($lines)) {
728            if ($old === false) return;
729            unset($lengths[$old]);
730        } else {
731            if ($old !== false) return;
732            $lengths[] = $suffix;
733            sort($lengths);
734        }
735        $fh = @fopen($cachename.'.tmp', 'w');
736        if (!$fh) {
737            trigger_error("Failed to write index cache", E_USER_ERROR);
738            return;
739        }
740        @fwrite($fh, implode("\n", $lengths));
741        @fclose($fh);
742        if (isset($conf['fperm']))
743            chmod($cachename.'.tmp', $conf['fperm']);
744        io_rename($cachename.'.tmp', $cachename.'.idx');
745    }
746
747    /**
748     * Get the list of lengths indexed in the wiki.
749     *
750     * Read the index directory or a cache file and returns
751     * a sorted array of lengths of the words used in the wiki.
752     *
753     * @author YoBoY <yoboy.leguesh@gmail.com>
754     */
755    private function _listIndexLengths() {
756        global $conf;
757        $cachename = $conf['indexdir'].'/lengths';
758        clearstatcache();
759        if (@file_exists($cachename.'.idx')) {
760            $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
761            if ($lengths !== false) {
762                $idx = array();
763                foreach ($lengths as $length)
764                    $idx[] = (int)$length;
765                return $idx;
766            }
767        }
768
769        $dir = @opendir($conf['indexdir']);
770        if ($dir === false)
771            return array();
772        $lengths[] = array();
773        while (($f = readdir($dir)) !== false) {
774            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
775                $i = substr($f, 1, -4);
776                if (is_numeric($i))
777                    $lengths[] = (int)$i;
778            }
779        }
780        closedir($dir);
781        sort($lengths);
782        // save this in a file
783        $fh = @fopen($cachename.'.tmp', 'w');
784        if (!$fh) {
785            trigger_error("Failed to write index cache", E_USER_ERROR);
786            return;
787        }
788        @fwrite($fh, implode("\n", $lengths));
789        @fclose($fh);
790        if (isset($conf['fperm']))
791            chmod($cachename.'.tmp', $conf['fperm']);
792        io_rename($cachename.'.tmp', $cachename.'.idx');
793
794        return $lengths;
795    }
796
797    /**
798     * Get the word lengths that have been indexed.
799     *
800     * Reads the index directory and returns an array of lengths
801     * that there are indices for.
802     *
803     * @author YoBoY <yoboy.leguesh@gmail.com>
804     */
805    private function _indexLengths($filter) {
806        global $conf;
807        $idx = array();
808        if (is_array($filter)) {
809            // testing if index files exist only
810            $path = $conf['indexdir']."/i";
811            foreach ($filter as $key => $value) {
812                if (@file_exists($path.$key.'.idx'))
813                    $idx[] = $key;
814            }
815        } else {
816            $lengths = idx_listIndexLengths();
817            foreach ($lengths as $key => $length) {
818                // keep all the values equal or superior
819                if ((int)$length >= (int)$filter)
820                    $idx[] = $length;
821            }
822        }
823        return $idx;
824    }
825
826    /**
827     * Insert or replace a tuple in a line.
828     *
829     * @author Tom N Harris <tnharris@whoopdedo.org>
830     */
831    private function _updateTuple($line, $id, $count) {
832        $newLine = $line;
833        if ($newLine !== '')
834            $newLine = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $newLine);
835        $newLine = trim($newLine, ':');
836        if ($count) {
837            if (strlen($newLine) > 0)
838                return "$id*$count:".$newLine;
839            else
840                return "$id*$count".$newLine;
841        }
842        return $newLine;
843    }
844
845    /**
846     * Split a line into an array of tuples.
847     *
848     * @author Tom N Harris <tnharris@whoopdedo.org>
849     * @author Andreas Gohr <andi@splitbrain.org>
850     */
851    private function _parseTuples(&$keys, $line) {
852        $result = array();
853        if ($line == '') return $result;
854        $parts = explode(':', $line);
855        foreach ($parts as $tuple) {
856            if ($tuple == '') continue;
857            list($key, $cnt) = explode('*', $tuple);
858            if (!$cnt) continue;
859            $key = $keys[$key];
860            if (!$key) continue;
861            $result[$key] = $cnt;
862        }
863        return $result;
864    }
865}
866
/**
 * Create an instance of the indexer.
 *
 * The Doku_Indexer object is built on the first call and kept in a static
 * variable; every later call hands out that same object.
 *
 * @return object               a Doku_Indexer
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $instance;
    if ($instance === null) {
        $instance = new Doku_Indexer();
    }
    return $instance;
}
880
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<lang>/stopwords.txt below DOKU_INC, where
 * <lang> is $conf['lang']. It is loaded on the first call only and kept in a
 * static variable afterwards. A missing or unreadable file results in an
 * empty list.
 *
 * @return array                list of stop words
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = array();
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if(@file_exists($swfile)){
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() yields false when the file cannot be read; keep the
            // empty list in that case so callers always get an array
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
900
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * The page is skipped when its '.indexed' meta file contains the current
 * idx_get_version() value and is newer than the page file itself.
 * Otherwise the INDEXER_PAGE_ADD event is signalled with array($page, $body)
 * as data; when advise_before() allows the default action, the raw wiki text
 * of the page is appended to the body. The resulting page/body pair is then
 * passed to the indexer and, on success, the '.indexed' file is rewritten.
 *
 * Note: in verbose mode the function returns true once "finished" has been
 * printed, regardless of what addPageWords() returned.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @return boolean              the function completed successfully
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false) {
    // check if indexing needed
    $idxtag = metaFN($page,'.indexed');
    if(@file_exists($idxtag)){
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            // compare against the file of the page that is being indexed
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    $body = '';
    $data = array($page, $body);
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data[1] = $data[1] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // event handlers may have replaced the page name and/or the body
    list($page,$body) = $data;

    $Indexer = idx_get_indexer();
    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}
946
/**
 * Find tokens in the fulltext index
 *
 * Hands an array of words to the lookup() method of the indexer returned
 * by idx_get_indexer() and passes its result back: a list of matching
 * pages for each word.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param arrayref      $words  list of words to search for
 * @return array                list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}
963
/**
 * Split a string into tokens
 *
 * Convenience wrapper: both arguments are forwarded unchanged to the
 * tokenizer() method of the indexer returned by idx_get_indexer().
 *
 * @param string        $string the text to split
 * @param boolean       $wc     forwarded as is (presumably a wildcard switch; see tokenizer())
 * @return mixed                whatever tokenizer() returns
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}
972
973/* For compatibility */
974
/**
 * Read the list of words in an index (if it exists).
 *
 * The file <indexdir>/<idx><suffix>.idx is read line by line; the lines
 * keep their trailing newline. A missing file gives an empty array.
 *
 * @param string        $idx    name of the index
 * @param string        $suffix suffix appended to the index name
 * @return array                the lines of the index file
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    return @file_exists($path) ? file($path) : array();
}
986
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is not 0, the result is cached in
 * <indexdir>/lengths.idx and that file is used as long as it is younger
 * than $conf['readdircache'] seconds. Otherwise, or when the cache is
 * missing, outdated or unreadable, the index directory is scanned for
 * files named i<number>.idx.
 *
 * @return array                sorted list of word lengths (integers); empty
 *                              when the index directory cannot be opened
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    // testing what we have to do, create a cache file or not.
    $docache = !($conf['readdircache'] == 0);

    if ($docache) {
        clearstatcache();
        if (@file_exists($cachefile)
        && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: scan the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            // the part between the leading 'i' and '.idx' is the word length
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // caching is best effort: a cache that cannot be written is skipped
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
1040
/**
 * Get the word lengths that have been indexed.
 *
 * With an array as filter, the keys of the array are taken as word lengths
 * and only those with an existing i<length>.idx file in the index directory
 * are returned; the array values are ignored.
 * With anything else as filter, the lengths from idx_listIndexLengths() are
 * reduced to those that are at least as large as the filter (integer
 * comparison).
 *
 * @param array|int     $filter array keyed by word length, or a minimum length
 * @return array                list of word lengths
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_indexLengths($filter) {
    global $conf;

    if (is_array($filter)) {
        // testing if index files exist only
        $existing = array();
        $base = $conf['indexdir'].'/i';
        foreach ($filter as $len => $unused) {
            if (!@file_exists($base.$len.'.idx')) continue;
            $existing[] = $len;
        }
        return $existing;
    }

    // keep all the values equal or superior
    $minimum = (int)$filter;
    $kept = array();
    foreach (idx_listIndexLengths() as $len) {
        if ((int)$len >= $minimum) {
            $kept[] = $len;
        }
    }
    return $kept;
}
1069
/**
 * Clean a name of a key for use as a file name.
 *
 * The name is cast to a string, trimmed and passed through utf8_romanize().
 * Runs of spaces, dots, slashes, colons and hyphens then become a single
 * underscore, anything that is not a letter, number, or underscore is
 * stripped away and the result is lower-cased.
 *
 * NOTE(review): the '\\' in the first pattern reaches PCRE as one backslash
 * that merely escapes the colon, so a literal backslash is not replaced by
 * an underscore; it is removed by the second replacement instead.
 *
 * @param mixed         $name   the key name
 * @return string               the cleaned name
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_cleanName($name) {
    $romanized = utf8_romanize(trim((string)$name));
    $joined = preg_replace('#[ \./\\:-]+#', '_', $romanized);
    return strtolower(preg_replace('/[^A-Za-z0-9_]/', '', $joined));
}
1084
1085//Setup VIM: ex: et ts=4 :
1086