xref: /dokuwiki/inc/indexer.php (revision 3fc667cfbc05c58679586aeccdb47438798a8e15)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
// Asian characters are indexed as words of their own. IDX_ASIAN is the regexp
// character class listing the Unicode ranges that are treated this way.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// The original author notes he is no language expert: if a range looks wrong
// or is missing, please report it.
define('IDX_ASIAN', '['.implode('', array(
    '\x{0E00}-\x{0E7F}',  // Thai
    '\x{2E80}-\x{D7AF}',  // CJK -> Hangul
    '\x{F900}-\x{FAFF}',  // CJK Compatibility Ideographs
    '\x{FE30}-\x{FE4F}',  // CJK Compatibility Forms
)).']');
26
27
/**
 * Write a list of strings to an index file.
 *
 * The data is written to "<indexdir>/<pre><wlen>.tmp" first and only moved
 * over "<indexdir>/<pre><wlen>.idx" after the complete content was written,
 * so a failed or short write never replaces an existing index.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $pre   prefix of the index file name
 * @param mixed  $wlen  second part of the index file name (callers pass a
 *                      word length or an empty string)
 * @param array  $idx   index entries; they are joined without a separator,
 *                      so each entry has to carry its own line ending
 * @return bool         false when the temp file could not be opened or
 *                      written completely, true otherwise
 */
function idx_saveIndex($pre, $wlen, $idx){
    global $conf;
    $fn  = $conf['indexdir'].'/'.$pre.$wlen;
    $tmp = $fn.'.tmp';

    $fh = @fopen($tmp,'w');
    if(!$fh) return false;

    $data    = join('',$idx);
    $written = fwrite($fh,$data);
    $closed  = fclose($fh);

    // a failed or short write (e.g. disk full) must not replace the good index
    if($written !== strlen($data) || !$closed){
        @unlink($tmp);
        return false;
    }

    if($conf['fperm']) chmod($tmp, $conf['fperm']);
    io_rename($tmp, $fn.'.idx');
    return true;
}
44
/**
 * Load an index file as a list of lines.
 *
 * Reads "<indexdir>/<pre><wlen>.idx". Every returned entry still ends with
 * its line break. A missing file yields an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($pre, $wlen){
    global $conf;
    $path = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    return @file_exists($path) ? file($path) : array();
}
56
/**
 * Make sure an index file exists.
 *
 * Creates an empty "<indexdir>/<pre><wlen>.idx" and applies the configured
 * file permissions ($conf['fperm']) to it. An existing file is left alone.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_touchIndex($pre, $wlen){
    global $conf;
    $path = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    if(@file_exists($path)) return;

    touch($path);
    if($conf['fperm']) chmod($path, $conf['fperm']);
}
70
/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, tokenizes it and registers every
 * word in the matching word index ("w<length>.idx"), appending words that
 * are not known yet. The word indexes are written back as a side effect.
 *
 * Returns an array of word counts, false if an error occurred.
 * Array is keyed on the word length, then the word index (the line number
 * of the word in the word index of that length).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param string $page  the page id
 * @return array|false  array(wordlen => array(wordid => frequency))
 */
function idx_getPageWords($page){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = array();
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        // an unreadable stopword file is treated like a missing one
        if($stopwords === false) $stopwords = array();
    }

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there maybe a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        if (!empty($links)) {
            $tmp = join(' ',array_keys($links));                // make a single string
            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // numeric tokens come back from array_count_values() as integer keys
        $arr = idx_tokenizer((string)$word,$stopwords);
        $arr = array_count_values($arr);
        foreach ($arr as $w => $c) {
            $l = strlen((string)$w);
            if(!isset($words[$l]))     $words[$l]     = array();
            if(!isset($words[$l][$w])) $words[$l][$w] = 0;
            // accumulate: the same word can be produced by several tokens
            $words[$l][$w] += $c * $count;
        }
    }

    // arrive here with $words = array(wordlen => array(word => frequency))

    $index = array(); //resulting index
    foreach (array_keys($words) as $wlen){
        $word_idx = idx_getIndex('w',$wlen);
        $index[$wlen] = array();
        foreach ($words[$wlen] as $word => $freq) {
            // strict search: index entries are strings and must match exactly
            $wid = array_search("$word\n",$word_idx,true);
            if(!is_int($wid)){
                $word_idx[] = "$word\n";
                $wid = count($word_idx)-1;
            }
            $index[$wlen][$wid] = $freq;
        }

        // save back word index
        if(!idx_saveIndex('w',$wlen,$word_idx)){
            trigger_error("Failed to write word index", E_USER_ERROR);
            return false;
        }
    }

    return $index;
}
150
/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * For every word length used on the page the matching "i<length>.idx"
 * file is rewritten line by line through a temp file: the page's old entry
 * is dropped from each line and its current count is added where the page
 * uses the word of that line.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $page  the page id
 * @return bool         false on failure, true otherwise
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        if (!idx_saveIndex('page','',$page_idx))
            return false;
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    foreach(array_keys($words) as $wlen){
        // word id => frequency for this word length
        $counts = $words[$wlen];

        // Open index and temp file
        $fn = $conf['indexdir']."/i$wlen";
        idx_touchIndex('i',$wlen);
        $idx = fopen($fn.'.idx','r');
        $tmp = fopen($fn.'.tmp','w');
        if(!$idx || !$tmp){
            // don't leak the handle that could be opened
            if($idx) fclose($idx);
            if($tmp) fclose($tmp);
            trigger_error("Failed to open index files", E_USER_ERROR);
            return false;
        }

        // copy from index to temp file, modifying where needed
        $lno = 0;
        $line = '';
        while (!feof($idx)) {
            // read full line
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;

            // write a new Line to temp file (count 0 = page doesn't use this word)
            idx_writeIndexLine($tmp,$line,$pid,isset($counts[$lno]) ? $counts[$lno] : 0);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // a last line without trailing newline is still a full index line;
        // dropping it would shift and overwrite its entries below
        if($line != ''){
            idx_writeIndexLine($tmp,$line,$pid,isset($counts[$lno]) ? $counts[$lno] : 0);
            $lno++;
        }
        fclose($idx);

        // add missing lines (usually index and word should contain
        // the same number of lines, however if the page contained
        // new words the word file has some more lines which need to
        // be added here
        $word_idx = idx_getIndex('w',$wlen);
        $wcnt = count($word_idx);
        for(; $lno<$wcnt; $lno++){
            idx_writeIndexLine($tmp,'',$pid,isset($counts[$lno]) ? $counts[$lno] : 0);
        }

        // close the temp file and move it over to be the new one
        fclose($tmp);
        if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
        // try rename first (fast) fallback to copy (slow)
        io_rename($fn.'.tmp', $fn.'.idx');
    }

    return true;
}
227
/**
 * Write a new index line to the filehandle
 *
 * An index line is a list of "doc*count" entries separated by colons.
 * The entry of the given document is dropped from the given line; when
 * $count is not empty a fresh "pid*count" entry is appended. The line is
 * always terminated with a newline, even when it ends up empty.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle to write to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    id of the document being updated
 * @param int      $count  new count for the document, 0 to remove it
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out     = '';
    $entries = trim($line);

    if($entries != ''){
        // keep the entries of all other documents
        foreach(explode(':',$entries) as $entry){
            if($entry == '') continue;
            list($doc,$cnt) = explode('*',$entry);
            if($doc == $pid) continue;
            $out .= "$doc*$cnt:";
        }
    }

    // re-add the document with its current count
    if($count) $out .= "$pid*$count";

    fwrite($fh,$out."\n");
}
260
/**
 * Get the word lengths that have been indexed.
 *
 * Scans the index directory for "i<length>.idx" files. The result starts
 * with $minlen itself if an index for exactly that length exists, followed
 * by every longer length in the order the directory listing returns them.
 * An unreadable index directory gives an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_indexLengths($minlen){
    global $conf;

    $handle = @opendir($conf['indexdir']);
    if($handle === false) return array();

    $lengths = array();

    // Exact match first.
    if(@file_exists($conf['indexdir']."/i$minlen.idx")) $lengths[] = $minlen;

    while(($entry = readdir($handle)) !== false){
        // only files named i<something>.idx are of interest
        if(substr($entry,0,1) != 'i' || substr($entry,-4) != '.idx') continue;

        $len = substr($entry,1,-4);
        if(is_numeric($len) && $len > $minlen) $lengths[] = $len;
    }
    closedir($handle);

    return $lengths;
}
288
/**
 * Lookup words in index
 *
 * Takes an array of word and will return a list of matching
 * documents for each one.
 *
 * A word may start and/or end with '*' as a wildcard. Words shorter than
 * three bytes are skipped unless they are numeric or use a wildcard.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array $words  list of words to look up
 * @return array|false  array(word => array(page => count)), false if an
 *                      index file could not be opened
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;
        $ptn   = '';
        $xword = $word;
        $wlen  = strlen($word);

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            $ptn   = '/'.preg_quote($xword,'/').'$/';
            $wlen -= 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
            $wlen -= 1;
        }
        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            foreach (idx_indexLengths($wlen) as $ixlen){
                $word_idx = idx_getIndex('w',$ixlen);
                $cnt = count($word_idx);
                for($wid=0; $wid<$cnt; $wid++){
                    $iword = $word_idx[$wid];
                    // $wild: 1 = leading '*', 2 = trailing '*', 3 = both
                    if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                        (($wild==1) && preg_match($ptn,$iword)) ||
                        (($wild==2) && (0 === strpos($iword,$xword)))
                      ){
                        if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
                        $wids[$ixlen][] = $wid;
                        $result[$word][] = "$ixlen*$wid";
                    }
                }
            }
        }else{     // handle exact search
            $word_idx = idx_getIndex('w',$wlen);
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                // append - other words of the same length may already have
                // registered their ids for this index
                if(!isset($wids[$wlen])) $wids[$wlen] = array();
                $wids[$wlen][] = $wid;
                $result[$word][] = "$wlen*$wid";
            }
        }
    }

    $docs = array();                          // hold docs found
    foreach(array_keys($wids) as $wlen){
        // sorted list of distinct word ids to fetch from this index
        $ixids = array_unique($wids[$wlen]);
        sort($ixids);

        // Open index
        idx_touchIndex('i',$wlen);
        $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
        if(!$idx){
            msg("Failed to open index file",-1);
            return false;
        }

        // Walk the index til the lines are found
        $lno  = 0;
        $line = '';
        $srch = array_shift($ixids);               // which word do we look for?
        while (!feof($idx)) {
            // read full line
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;
            if($lno > $srch)             break;   // shouldn't happen

            // do we want this line?
            if($lno == $srch){
                // add docs to list
                $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);

                $srch = array_shift($ixids);        // next word to look up
                if($srch === null) break;           // no more words
            }

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        fclose($idx);
    }

    // merge found pages into final result array
    $final = array();
    foreach($result as $word => $found){
        $final[$word] = array();
        foreach($found as $wid){
            // the index may be shorter than the word list
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                if(!isset($final[$word][$hitkey])) $final[$word][$hitkey] = 0;
                $final[$word][$hitkey] += $hitcnt;
            }
        }
    }
    return $final;
}
416
/**
 * Returns a list of documents and counts from a index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist. Malformed entries and entries pointing to a page id that is
 * not part of $page_idx are skipped as well.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    foreach(explode(':',$line) as $part){
        if($part == '') continue;

        // every entry has the form "pageid*count"
        $pair = explode('*',$part);
        if(count($pair) < 2) continue;
        list($doc,$cnt) = $pair;
        if(!$cnt) continue;

        // translate the page id to the page name
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        if(!$doc) continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}
447
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * Words are lowercased. Non-numeric words shorter than three bytes and
 * words found in the stopword list are dropped.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords, each entry terminated by
 *                             a newline (as read with file())
 * @param boolean  $wc         are wildcards allowed?
 * @return array               list of words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words  = array();
    $string = (string)$string;
    // unless wildcards are allowed, '\*' is added to the extra character
    // list handed to utf8_stripspecials() below
    $wc = ($wc) ? '' : '\*';

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            // strict search: "1e1" must not match a stopword "10"
            if($stopwords && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        // plain ASCII letters and digits only - no special handling needed
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($stopwords && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}
485
486//Setup VIM: ex: et ts=4 enc=utf-8 :
487