xref: /dokuwiki/inc/indexer.php (revision 168f9feebfac547ac832cdc2561a85158f9bed2d)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
// Asian characters are indexed as words of their own. IDX_ASIAN is the PCRE
// character class (to be used with the /u modifier) listing the Unicode
// ranges that get this treatment.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// The selection was not made by a language expert: if a range looks wrong
// or is missing, please report it.
define('IDX_ASIAN', '['.implode('', array(
    '\x{0E00}-\x{0E7F}',  // Thai
    '\x{2E80}-\x{D7AF}',  // CJK -> Hangul
    '\x{F900}-\x{FAFF}',  // CJK Compatibility Ideographs
    '\x{FE30}-\x{FE4F}',  // CJK Compatibility Forms
)).']');
26
27
/**
 * Split a page into words
 *
 * Tokenizes the raw wiki text of the given page, counts how often each word
 * occurs and maps every word to its ID, which is its line number in
 * word.idx. Words missing from word.idx are appended to it; the file is
 * written back when that happened or when it did not exist yet.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  ID of the page to analyse
 * @return array|false   array(word ID => frequency), false if word.idx
 *                       could not be opened for writing
 */
function idx_getPageWords($page){
    global $conf;

    // load the known words; a missing or unreadable word.idx counts as empty
    $wordfile = $conf['cachedir'].'/word.idx';
    $word_idx = @file_exists($wordfile) ? file($wordfile) : false;
    $have_idx = is_array($word_idx);
    if(!$have_idx) $word_idx = array();

    // load the stopwords, flipped so that each check below is a hash lookup
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();
    $stopwords = is_array($stopwords) ? array_flip($stopwords) : array();

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are
    // added to the token array (needed by the backlink function)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        // the metadata may be missing for this page, only use a real list
        if (is_array($links) && count($links)) {
            $tmp = join(' ',array_keys($links));   // make a single string
            $tmp = strtr($tmp, ':', ' ');          // namespace separator -> space

            foreach (array_unique(explode(' ', $tmp)) as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // numeric tokens come back from the array as integer keys
        $word = (string) $word;

        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                // words shorter than 3 bytes are ignored unless they are numbers
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    // map each index line to its first line number. Exact string keys avoid
    // the loose comparison of array_search(), which treats numeric-looking
    // words such as "1e3" and "1000" as equal
    $known = array();
    foreach ($word_idx as $wid => $entry) {
        if (!isset($known[$entry])) $known[$entry] = $wid;
    }

    $index = array(); //resulting index
    $dirty = false;   //were new words added?
    foreach ($words as $word => $freq) {
        $entry = "$word\n";
        if (isset($stopwords[$entry])) continue;

        if (isset($known[$entry])) {
            $wid = $known[$entry];
        } else {
            $wid           = count($word_idx);
            $word_idx[]    = $entry;
            $known[$entry] = $wid;
            $dirty         = true;
        }
        $index[$wid] = $freq;
    }

    // save back word index, but only if it changed or has to be created
    if ($dirty || !$have_idx) {
        $fh = fopen($wordfile,'w');
        if(!$fh){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
        fwrite($fh,join('',$word_idx));
        fclose($fh);
    }

    return $index;
}
112
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is new, its words are fetched
 * with idx_getPageWords() and index.idx is rewritten line by line through
 * a temporary file, replacing the page's entry on every line.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page to index
 * @return bool          true on success or when the page has no words,
 *                       false when one of the index files could not be
 *                       written
 */
function idx_addPage($page){
    global $conf;

    $pagefile = $conf['cachedir'].'/page.idx';
    $idxfile  = $conf['cachedir'].'/index.idx';
    $tmpfile  = $conf['cachedir'].'/index.tmp';

    // load known documents; a missing page.idx counts as empty
    $page_idx = @file_exists($pagefile) ? file($pagefile) : array();
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx); compare strictly so
    // numeric-looking page names are never confused with each other
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($pagefile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    // NOTE(review): entries written for an earlier revision of the page are
    // not removed from the index when the page has no words anymore - confirm
    // whether that is intended
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($idxfile,'r');
    $tmp = fopen($tmpfile,'w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (lines may be longer than one chunk)
        $chunk = fgets($idx, 4096);
        if($chunk === false) break;
        $line .= $chunk;
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file; words not used by the page get 0
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // a last line without trailing newline is still a line of the index;
    // dropping it would shift all following word IDs
    if($line !== ''){
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    if(io_rename($tmpfile,$idxfile) === false) return false;

    return true;
}
186
/**
 * Write a new index line to the filehandle
 *
 * Rebuilds one line of the index file: every "doc*count" entry of the
 * given line that belongs to another document is kept (each followed by
 * a colon), the entry of $pid is dropped and appended again only when
 * $count is non-zero. The line is always terminated by a newline.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle to write to
 * @param string   $line   the current index line, may be empty
 * @param int      $pid    number of the document being updated
 * @param int      $count  new frequency for that document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out     = '';
    $entries = trim($line);

    // keep the entries of all other documents
    if($entries != ''){
        foreach(explode(':',$entries) as $entry){
            if($entry == '') continue;
            list($doc,$cnt) = explode('*',$entry);
            if($doc == $pid) continue;
            $out .= "$doc*$cnt:";
        }
    }

    // re-add the document with its new count
    if($count){
        $out .= "$pid*$count";
    }

    // terminate the line and write it out
    fwrite($fh,$out."\n");
}
219
/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one. A word may start and/or end with '*' to
 * match the end, the beginning or any part of the indexed words.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array $words  the words to look up
 * @return array|false   array(word => array(page => count)), false if
 *                       the index file could not be opened
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents; unreadable files count as empty
    $page_idx = file($conf['cachedir'].'/page.idx');
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($page_idx)) $page_idx = array();
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;   // 1: leading wildcard, 2: trailing wildcard, 3: both
        $xword = $word;
        $ptn   = '';

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            // index entries end in a newline, '$' matches right before it
            $ptn = '/'.preg_quote($xword,'/').'$/';
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
        }

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            $cnt = count($word_idx);
            for($wid=0; $wid<$cnt; $wid++){
                $iword = $word_idx[$wid];
                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                    (($wild==1) && preg_match($ptn,$iword)) ||
                    (($wild==2) && (0 === strpos($iword,$xword)))
                  ){
                    $wids[] = $wid;
                    $result[$word][] = $wid;
                }
            }
        }else{     // handle exact search
            // strict comparison: "1e3" must not match an indexed "1000"
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                $wids[] = $wid;
                $result[$word][] = $wid;
            }
        }
    }
    $wids = array_unique($wids);
    sort($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index file",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
                                              // null when nothing matched
    while ($srch !== null && !feof($idx)) {
        // read full line (lines may be longer than one chunk)
        $chunk = fgets($idx, 4096);
        if($chunk === false) break;
        $line .= $chunk;
        if(substr($line,-1) != "\n" && !feof($idx)) continue;

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);
            $srch = array_shift($wids);        // next word, null if done
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // an unterminated last line that was read but not handled in the loop
    if($srch !== null && $line !== '' && $lno == $srch){
        $docs[$srch] = idx_parseIndexLine($page_idx,$line);
    }
    fclose($idx);

    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // the index may have fewer lines than the word list
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}
335
/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0, malformed entries, document numbers
 * not listed in $page_idx and pages that no longer exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page ID => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    foreach(explode(':',$line) as $part){
        if($part == '') continue;

        // every entry has the form "docnumber*count"
        $pair = explode('*',$part);
        if(count($pair) < 2) continue;
        list($doc,$cnt) = $pair;
        if(!$cnt) continue;

        // translate the document number into the page ID
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against the empty string: "0" is a valid page ID
        if($doc === '') continue;

        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}
366
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords(): special chars are
 * stripped, Asian characters become words of their own, everything is
 * lowercased, and words shorter than 3 bytes (unless numeric) as well as
 * stopwords are dropped.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords, one per entry with a
 *                             trailing newline; anything that is not an
 *                             array means "no stopwords"
 * @param boolean  $wc         are wildcards allowed?
 * @return array               list of search words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();

    // '*' is stripped like the other special chars unless wildcards are allowed
    $wc = ($wc) ? '' : '\*';

    // only search the stopwords when there is a list to search in
    $filter = is_array($stopwords) && count($stopwords);

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            // strict comparison: "1e3" must not be dropped for a stopword "1000"
            if($filter && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        // plain ASCII word, no special handling needed
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($filter && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}
404
405//Setup VIM: ex: et ts=4 enc=utf-8 :
406