xref: /dokuwiki/inc/indexer.php (revision 579b0f7e8d80287b11fd441dfa68d15e9d4bb74c)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
// Characters from Asian scripts are treated as words of their own. The
// constant below is the regular expression character class (a list of
// Unicode ranges) used to recognise them.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// The original author states he is no language expert: if a range looks
// wrongly chosen or is missing, please get in touch with him.
define(
    'IDX_ASIAN',
    '['
    . '\x{0E00}-\x{0E7F}'   // Thai
    . '\x{2E80}-\x{D7AF}'   // CJK -> Hangul
    . '\x{F900}-\x{FAFF}'   // CJK Compatibility Ideographs
    . '\x{FE30}-\x{FE4F}'   // CJK Compatibility Forms
    . ']'
);
26
27
/**
 * Write a list of strings to an index file.
 *
 * The entries are concatenated and written to "<indexdir>/<pre><wlen>.tmp",
 * which is then moved over "<indexdir>/<pre><wlen>.idx" using io_rename().
 * The existing index is only replaced once the temporary file has been
 * written completely.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string     $pre  index file prefix (e.g. 'w' or 'page')
 * @param int|string $wlen word length the index belongs to ('' for none)
 * @param array      $idx  the index lines to store
 * @return bool false if the temporary file could not be opened or written,
 *              true otherwise
 */
function idx_saveIndex($pre, $wlen, $idx){
    global $conf;
    $fn  = $conf['indexdir'].'/'.$pre.$wlen;
    $tmp = $fn.'.tmp';

    $fh = @fopen($tmp,'w');
    if(!$fh) return false;

    $data    = join('',$idx);
    $written = fwrite($fh,$data);
    fclose($fh);

    // a failed or short write (e.g. disk full) must never replace the
    // existing, intact index file
    if($written === false || $written < strlen($data)){
        @unlink($tmp);
        return false;
    }

    if(!empty($conf['fperm'])) chmod($tmp, $conf['fperm']);
    io_rename($tmp, $fn.'.idx');
    return true;
}
44
/**
 * Load the lines of an index file (if it exists).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string     $pre  index file prefix
 * @param int|string $wlen word length suffix of the index file
 * @return array the lines of "<indexdir>/<pre><wlen>.idx" as returned by
 *               file() (newlines included), or an empty array when the
 *               file does not exist
 */
function idx_getIndex($pre, $wlen){
    global $conf;
    $path = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    return @file_exists($path) ? file($path) : array();
}
56
/**
 * Make sure an index file exists, creating an empty one when needed.
 *
 * A newly created file gets the permissions configured in $conf['fperm']
 * (when that setting is non-empty). An existing file is left untouched.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string     $pre  index file prefix
 * @param int|string $wlen word length suffix of the index file
 */
function idx_touchIndex($pre, $wlen){
    global $conf;
    $path = $conf['indexdir'].'/'.$pre.$wlen.'.idx';

    // nothing to do for an index that is already there
    if(@file_exists($path)) return;

    touch($path);
    if($conf['fperm']) chmod($path, $conf['fperm']);
}
70
/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, tokenizes it with idx_tokenizer()
 * and looks every resulting word up in the word index of its length
 * ("w<length>" index). Words not listed there yet are appended and the
 * word index is written back.
 *
 * Returns an array of word counts, false if an error occurred.
 * Array is keyed on the word length, then the word index (the line number
 * of the word in the word index of that length).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param string $page the page id
 * @return array|false array(wordlen => array(wordid => frequency))
 */
function idx_getPageWords($page){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = array();
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        // file() signals a read failure with false
        if(!is_array($stopwords)) $stopwords = array();
    }

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are
    // added to the token array (this is necessary for the backlink
    // function -- there may be a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        if (is_array($links)) {
            $tmp = join(' ',array_keys($links));                // make a single string
            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    // build $words = array(wordlen => array(word => frequency))
    $words = array();
    foreach ($tokens as $token => $count) {
        // numeric tokens come back from the array as integer keys
        $arr = idx_tokenizer((string) $token,$stopwords);
        $arr = array_count_values($arr);
        foreach ($arr as $w => $c) {
            $l = strlen((string) $w);
            if(!isset($words[$l])) $words[$l] = array();

            // add to what was already counted for this word; several
            // tokens can yield the same word
            $seen = isset($words[$l][$w]) ? $words[$l][$w] : 0;
            $words[$l][$w] = $c * $count + $seen;
        }
    }

    $index = array(); //resulting index
    foreach (array_keys($words) as $wlen){
        $word_idx = idx_getIndex('w',$wlen);
        foreach ($words[$wlen] as $word => $freq) {
            // strict search: both sides are strings and numeric looking
            // words like "100" and "1e2" must not be treated as equal
            $wid = array_search("$word\n",$word_idx,true);
            if(!is_int($wid)){
                // unknown word - append it, its id is the new last line
                $word_idx[] = "$word\n";
                $wid = count($word_idx)-1;
            }
            if(!isset($index[$wlen]))
                $index[$wlen] = array();
            $index[$wlen][$wid] = $freq;
        }

        // save back word index
        if(!idx_saveIndex('w',$wlen,$word_idx)){
            trigger_error("Failed to write word index", E_USER_ERROR);
            return false;
        }
    }

    return $index;
}
148
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in the page index if it is not listed yet (its
 * id is its line number there). Then, for every word length occurring in
 * the page, the matching "i<length>" index is copied line by line to a
 * temporary file while the page's entry on each line is replaced, and the
 * temporary file is finally moved over the index.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $page the page id
 * @return bool false on failure, true otherwise
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get page id (this is the linenumber in page.idx)
    // strict search: numeric looking page names must not match each other
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        if (!idx_saveIndex('page','',$page_idx))
            return false;
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    foreach(array_keys($words) as $wlen){
        // Open index and temp file
        $fn = $conf['indexdir']."/i$wlen";
        idx_touchIndex('i',$wlen);
        $idx = fopen($fn.'.idx','r');
        $tmp = fopen($fn.'.tmp','w');
        if(!$idx || !$tmp){
            // do not leak the handle that could be opened
            if($idx) fclose($idx);
            if($tmp) fclose($tmp);
            trigger_error("Failed to open index files", E_USER_ERROR);
            return false;
        }

        // copy from index to temp file, modifying where needed
        $lno = 0;
        $line = '';
        while (!feof($idx)) {
            // read full line (long lines arrive in several chunks)
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;

            // write a new line to temp file; words the page does not
            // contain get a count of 0, which removes the page from the line
            $count = isset($words[$wlen][$lno]) ? $words[$wlen][$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$count);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // a last line without trailing newline is still a valid entry and
        // must not be dropped
        if($line != ''){
            $count = isset($words[$wlen][$lno]) ? $words[$wlen][$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$count);
            $lno++;
        }
        fclose($idx);

        // add missing lines (usually index and word should contain
        // the same number of lines, however if the page contained
        // new words the word file has some more lines which need to
        // be added here
        $wcnt = count(idx_getIndex('w',$wlen));
        for(; $lno<$wcnt; $lno++){
            $count = isset($words[$wlen][$lno]) ? $words[$wlen][$lno] : 0;
            idx_writeIndexLine($tmp,'',$pid,$count);
        }

        // close the temp file and move it over to be the new one
        fclose($tmp);
        if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
    }

    return true;
}
225
/**
 * Write a new index line to the filehandle
 *
 * An index line is a colon separated list of "docid*count" entries. This
 * function copies all entries of the given line that belong to other
 * documents, appends a fresh entry for the given document when $count is
 * not empty, and terminates the line with a newline.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh    handle to write the line to
 * @param string   $line  the existing index line (may be empty)
 * @param int      $pid   id of the document to remove/re-add
 * @param int      $count number of occurrences in the document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out  = '';
    $line = trim($line);

    if($line != ''){
        foreach(explode(':',$line) as $entry){
            if($entry == '') continue;
            list($doc,$cnt) = explode('*',$entry);
            // the document's old entry is dropped here
            if($doc == $pid) continue;
            $out .= "$doc*$cnt:";
        }
    }

    // re-add the document with its current count
    if ($count) $out .= "$pid*$count";

    fwrite($fh,$out."\n");
}
258
/**
 * Get the word lengths that have been indexed.
 *
 * Scans the index directory for "i<length>.idx" files and returns the
 * lengths found. The requested minimum length itself comes first (when an
 * index for it exists), followed by all larger lengths in directory order.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param int $minlen the smallest word length of interest
 * @return array list of word lengths; empty if the directory can't be read
 */
function idx_indexLengths($minlen){
    global $conf;

    $handle = @opendir($conf['indexdir']);
    if($handle === false) return array();

    $lengths = array();

    // Exact match first.
    if(@file_exists($conf['indexdir']."/i$minlen.idx")){
        $lengths[] = $minlen;
    }

    while (($entry = readdir($handle)) !== false) {
        // only files named i<something>.idx are of interest
        if (substr($entry,0,1) != 'i') continue;
        if (substr($entry,-4) != '.idx') continue;

        $len = substr($entry,1,-4);
        if (is_numeric($len) && $len > $minlen) $lengths[] = $len;
    }
    closedir($handle);

    return $lengths;
}
286
/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * A word may start and/or end with '*' to request a wildcard match.
 * Words shorter than three characters are skipped unless they are numeric
 * or use a wildcard.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array $words the words to look up
 * @return array|false array(word => array(page => count)), false if an
 *                     index file could not be opened
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get word IDs, collected as array(wordlen => list of word ids)
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;     // 1 = leading wildcard, 2 = trailing, 3 = both
        $ptn   = '';
        $xword = $word;
        $wlen  = strlen($word);

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            $ptn   = '/'.preg_quote($xword,'/').'$/';
            $wlen -= 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
            $wlen -= 1;
        }
        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            foreach (idx_indexLengths($wlen) as $ixlen){
                $word_idx = idx_getIndex('w',$ixlen);
                $cnt = count($word_idx);
                for($wid=0; $wid<$cnt; $wid++){
                    $iword = $word_idx[$wid];
                    if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                        (($wild==1) && preg_match($ptn,$iword)) ||
                        (($wild==2) && (0 === strpos($iword,$xword)))
                      ){
                        if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
                        $wids[$ixlen][] = $wid;
                        $result[$word][] = "$ixlen*$wid";
                    }
                }
            }
        }else{     // handle exact search
            $word_idx = idx_getIndex('w',$wlen);
            // strict search: numeric looking words must not match each other
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                // append - other query words of the same length may have
                // registered their ids already and must not be overwritten
                if(!isset($wids[$wlen])) $wids[$wlen] = array();
                $wids[$wlen][] = $wid;
                $result[$word][] = "$wlen*$wid";
            }
        }
    }

    $docs = array();                          // hold docs found
    foreach(array_keys($wids) as $wlen){
        // ascending list of distinct line numbers to fetch
        $ixids = array_unique($wids[$wlen]);
        sort($ixids);

        // Open index
        idx_touchIndex('i',$wlen);
        $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
        if(!$idx){
            msg("Failed to open index file",-1);
            return false;
        }

        // Walk the index til the lines are found
        $lno  = 0;
        $line = '';
        $srch = array_shift($ixids);               // which word do we look for?
        while (!feof($idx)) {
            // read full line (long lines arrive in several chunks)
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;
            if($lno > $srch)             break;   // shouldn't happen

            // do we want this line?
            if($lno == $srch){
                // add docs to list
                $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);

                $srch = array_shift($ixids);        // next word to look up
                if($srch === null) break;          // no more words
            }

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        fclose($idx);
    }

    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // the index may be shorter than the word list
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                if(!isset($final[$word][$hitkey])) $final[$word][$hitkey] = 0;
                $final[$word][$hitkey] += $hitcnt;
            }
        }
    }
    return $final;
}
414
/**
 * Returns a list of documents and counts from an index line
 *
 * An index line is a colon separated list of "docid*count" entries, where
 * docid is the position of the page in $page_idx. Entries with a count of
 * 0, malformed entries, entries pointing to an unknown page id and pages
 * whose file no longer exists are omitted.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array  array(page => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    foreach(explode(':',$line) as $part){
        if($part == '') continue;

        $pair = explode('*',$part);
        if(count($pair) < 2) continue;          // malformed: no count given
        list($doc,$cnt) = $pair;
        if(!$cnt) continue;

        // the line may mention a page id the page list does not have
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // only skip blank entries - "0" is a usable page name
        if($doc === '') continue;

        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}
445
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * Words are lowercased. Non-numeric words shorter than three bytes and
 * words found in the stopword list are dropped.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords (one per entry, each
 *                             terminated by a newline)
 * @param boolean  $wc         are wildcards allowed?
 * @return array   list of words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $found = array();

    // characters handed to utf8_stripspecials() in addition to its
    // defaults; the asterisk is only part of them when wildcards are
    // not allowed
    $extra = '\._\-:' . (($wc) ? '' : '\*');

    if(!preg_match('/[^0-9A-Za-z]/u', $string)){
        // nothing but ASCII letters and digits: the string is one word
        if (strlen($string) < 3 && !is_numeric($string)) return $found;
        $lower = strtolower($string);
        if(!is_int(array_search("$lower\n",$stopwords))) $found[] = $lower;
        return $found;
    }

    // handle asian chars as single words (may fail on older PHP version)
    $spaced = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
    if(!is_null($spaced)) $string = $spaced; //recover from regexp failure

    foreach (explode(' ', utf8_stripspecials($string,' ',$extra)) as $candidate) {
        if (!is_numeric($candidate) && strlen($candidate) < 3) continue;
        $candidate = utf8_strtolower($candidate);
        if($stopwords && is_int(array_search("$candidate\n",$stopwords))) continue;
        $found[] = $candidate;
    }

    return $found;
}
483
484//Setup VIM: ex: et ts=4 enc=utf-8 :
485