xref: /dokuwiki/inc/indexer.php (revision ccf5ab4be633f1da1ed602acaf3f1dc0ae27d388)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
// Characters of Asian scripts are indexed as words of their own. IDX_ASIAN
// is a PCRE character class (to be used with the /u modifier) that lists the
// Unicode ranges treated that way.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// The selection was not made by a language expert - please report ranges
// that are wrong or missing.
define('IDX_ASIAN','['.implode('',array(
    '\x{0E00}-\x{0E7F}',  // Thai
    '\x{2E80}-\x{D7AF}',  // CJK -> Hangul
    '\x{F900}-\x{FAFF}',  // CJK Compatibility Ideographs
    '\x{FE30}-\x{FE4F}',  // CJK Compatibility Forms
)).']');
26
27
/**
 * Split a page into words
 *
 * Reads the raw text of the page, counts how often each word occurs and
 * translates every word into its word ID (the line number of the word in
 * word.idx). Words not known yet are appended to word.idx, which is always
 * written back to the cache directory.
 *
 * Returns an array of word counts (word ID => frequency), false if an
 * error occurred
 *
 * @param  string $page ID of the page to split
 * @return array|false
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;

    // known words, one per line - the line number is the word ID
    $wfile    = $conf['cachedir'].'/word.idx';
    $word_idx = @file_exists($wfile) ? file($wfile) : false;
    if(!is_array($word_idx)) $word_idx = array(); // no usable word list yet

    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }
    if(!is_array($stopwords)) $stopwords = array();

    // Hash lookups replace array_search(): they are exact string matches
    // (a loose search treats numeric looking words such as "100" and "1e2"
    // as equal on PHP 8) and avoid one linear scan per word.
    $stopmap = array_flip($stopwords);
    $widmap  = array();
    foreach ($word_idx as $wid => $entry) {
        // the first occurrence wins, just like array_search() would pick it
        if(!isset($widmap[$entry])) $widmap[$entry] = $wid;
    }

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                // words shorter than three bytes are only kept when numeric
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        $entry = "$word\n";
        if (isset($stopmap[$entry])) continue;

        if (isset($widmap[$entry])) {
            $wid = $widmap[$entry];
        } else {
            // unknown word - append it, its ID is the new last line
            $word_idx[] = $entry;
            $wid = count($word_idx)-1;
            $widmap[$entry] = $wid;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($wfile,'w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}
97
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx (its line number is the page ID),
 * then index.idx is copied line by line into index.tmp while the entry of
 * this page is replaced on every line. Finally index.tmp replaces index.idx.
 *
 * NOTE(review): when the page contains no indexable words the function
 * returns early and does not touch index.idx, so entries written for an
 * earlier version of the page appear to stay in the index - confirm whether
 * that is intended.
 *
 * @param  string $page ID of the page to index
 * @return bool   true on success, false on failure
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array(); // unreadable - start empty

    // get page id (this is the linenumber in page.idx)
    // strict search: a loose one treats numeric looking IDs as equal on PHP 8
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp){
            fclose($tmp);
            @unlink($conf['cachedir'].'/index.tmp');
        }
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (true) {
        $chunk = feof($idx) ? false : fgets($idx, 4096);
        if($chunk === false){
            // end of file: stop, unless an unterminated last line is still
            // buffered - that one has to be copied as well
            if($line === '') break;
        }else{
            // long lines arrive in several chunks - read the full line
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;
        }

        // write a new line to temp file (count 0 drops the page from it)
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    if(@rename($conf['cachedir'].'/index.tmp',
               $conf['cachedir'].'/index.idx')){
        return true;
    }elseif(copy($conf['cachedir'].'/index.tmp',
                 $conf['cachedir'].'/index.idx')){
        unlink($conf['cachedir'].'/index.tmp');
        return true;
    }
    return false;
}
177
/**
 * Write one line of the index to the given filehandle
 *
 * The line is rebuilt from the given one: every "doc*count:" entry that
 * belongs to another document is kept, the entry of the given document is
 * dropped and appended again as "pid*count" when $count is not empty.
 * The line is always terminated with a newline.
 *
 * @param resource $fh    handle of the file to write to
 * @param string   $line  the existing index line, may be empty
 * @param int      $pid   ID of the document to update
 * @param int      $count how often the word occurs in the document
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out  = '';
    $line = trim($line);

    // keep the entries of all other documents
    if($line != ''){
        foreach(explode(':',$line) as $entry){
            if($entry == '') continue;
            list($doc,$cnt) = explode('*',$entry);
            if($doc == $pid) continue; // old entry of this document
            $out .= "$doc*$cnt:";
        }
    }

    // (re)add the document when the word is used in it
    if($count) $out .= "$pid*$count";

    fwrite($fh,$out."\n");
}
210
/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one. A word may start and/or end with '*' to
 * match all indexed words ending with, starting with or containing
 * the remaining string.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param  array $words the words to look up
 * @return array|false  array(word => array(page => count)) or false when
 *                      the index could not be opened
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;       // 1: leading wildcard, 2: trailing one, 3: both
        $xword = $word;

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = (string) substr($xword,1);
            $wild  = 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = (string) substr($xword,0,-1);
            $wild += 2;
        }

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            // identity comparisons: loose ones treat numeric looking
            // strings such as "010" and "1e1" as equal
            $xlen = strlen($xword);
            $cnt  = count($word_idx);
            for($wid=0; $wid<$cnt; $wid++){
                $iword = $word_idx[$wid]; // still carries its newline
                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                    (($wild==1) && ("$xword\n" === substr($iword,(-1*$xlen)-1))) ||
                    (($wild==2) && ($xword === substr($iword,0,$xlen)))
                  ){
                    $wids[] = $wid;
                    $result[$word][] = $wid;
                }
            }
        }else{     // handle exact search
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                $wids[] = $wid;
                $result[$word][] = $wid;
            }
        }
    }
    // ascending list of distinct word IDs = index lines to fetch
    sort($wids);
    $wids  = array_values(array_unique($wids));
    $total = count($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
        msg("Failed to open index file",-1);
        return false;
    }

    // Walk the index until all wanted lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $next = 0;                                // position in $wids
    while ($next < $total) {
        $chunk = feof($idx) ? false : fgets($idx, 4096);
        if($chunk === false){
            // end of file: stop, unless an unterminated last line is
            // still buffered - it is handled like a complete one
            if($line === '') break;
        }else{
            // long lines arrive in several chunks - read the full line
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;
        }

        // do we want this line?
        if($lno == $wids[$next]){
            $docs[$lno] = idx_parseIndexLine($page_idx,$line);
            $next++;                          // next word to look up
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // merge found pages into final result array
    $final = array();
    foreach($result as $word => $found){
        $final[$word] = array();
        foreach($found as $wid){
            // the index may be shorter than the word list
            if(!isset($docs[$wid])) continue;
            foreach($docs[$wid] as $hitkey => $hitcnt){
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}
321
/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0, malformed entries, entries that
 * refer to an unknown page number and pages that no longer exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array  array(page => count)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    foreach(explode(':',$line) as $part){
        if($part == '') continue;

        $pair = explode('*',$part);
        if(count($pair) < 2) continue;        // malformed entry without count
        list($doc,$cnt) = $pair;
        if(!$cnt) continue;

        // translate the page number into the page ID
        if(!isset($page_idx[$doc])) continue; // page number not known
        $doc = trim($page_idx[$doc]);
        // compare against the empty string: a page may be named "0"
        if($doc === '') continue;

        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}
352
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords(): the string is split at
 * special characters, Asian characters become words of their own, words
 * are lowercased, non-numeric words shorter than three bytes and
 * stopwords are dropped.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords, each one still carrying
 *                             its trailing newline
 * @param boolean  $wc         are wildcards allowed?
 * @return array   the list of words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();

    // characters to strip in addition to the usual special chars; the
    // asterisk is only stripped when wildcards are not allowed
    $strip = '._\-:'.($wc ? '' : '\*');

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $candidates = explode(' ', utf8_stripspecials($string,' ',$strip));
        $multibyte  = true;
    }else{
        // plain ASCII letters and digits only - this is a single word
        $candidates = array($string);
        $multibyte  = false;
    }

    $check = is_array($stopwords) && count($stopwords);
    foreach ($candidates as $w) {
        if (!is_numeric($w) && strlen($w) < 3) continue;
        $w = $multibyte ? utf8_strtolower($w) : strtolower($w);
        // strict match, a loose one confuses numeric looking words
        if($check && in_array("$w\n",$stopwords,true)) continue;
        $words[] = $w;
    }

    return $words;
}
390
391//Setup VIM: ex: et ts=4 enc=utf-8 :
392