xref: /dokuwiki/inc/indexer.php (revision 5783998f1518db5000b33432885f3153de6b579f)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
15// Asian characters are handled as words. The following regexp defines the
16// Unicode-Ranges for Asian characters
17// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
18// I'm no language expert. If you think some ranges are wrongly chosen or
19// a range is missing, please contact me
// The constant name has to be passed as a quoted string. A bare IDX_ASIAN is
// itself a lookup of a not yet defined constant, which only worked through
// the old "assume string" fallback and is a fatal Error as of PHP 8.
// The value is a PCRE character class and is meant to be embedded in a
// pattern that uses the /u modifier.
define('IDX_ASIAN','['.
                 '\x{0E00}-\x{0E7F}'.  // Thai
                 '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                 '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                 '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                 ']');
26
27
/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, splits it into words and counts
 * them. Every word is mapped to its word id, which is the zero-based line
 * number of the word in word.idx. Words that are not yet listed there are
 * appended, and word.idx is written back before returning.
 *
 * Words shorter than three bytes are dropped unless they are numeric, and
 * words listed in the stopword file of the configured language are skipped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  the page to read, passed on to rawWiki()
 * @return array|false   array(word id => frequency), false if word.idx
 *                       could not be opened for writing
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false for a missing or unreadable file - fall back to
    // an empty list so a not yet existing word index simply starts empty
    $wfile    = $conf['cachedir'].'/word.idx';
    $word_idx = @file_exists($wfile) ? file($wfile) : false;
    if(!is_array($word_idx)) $word_idx = array();

    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : false;
    if(!is_array($stopwords)) $stopwords = array();

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words
            $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    // Build exact-match lookup tables keyed by the raw file line (including
    // its newline). This replaces a linear array_search() per word and
    // avoids loose comparison, which can treat different numeric strings
    // such as "0100" and "100" as the same word.
    $stop = array();
    foreach ($stopwords as $line) {
        $stop[$line] = true;
    }
    $known = array();
    foreach ($word_idx as $wid => $line) {
        // keep the first occurrence, like array_search() would
        if (!isset($known[$line])) $known[$line] = $wid;
    }

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        $key = "$word\n";
        if (isset($stop[$key])) continue;

        if (isset($known[$key])) {
            $wid = $known[$key];
        } else {
            // unknown word: append it, its id is the new last line
            $word_idx[]  = $key;
            $wid         = count($word_idx)-1;
            $known[$key] = $wid;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($wfile,'w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}
96
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is not listed there yet (its
 * page id is its zero-based line number). Then index.idx is copied line by
 * line into index.tmp, replacing the entry of this page on every line, and
 * the temp file is finally moved over the old index.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  the page to index
 * @return bool          true on success, false if one of the index files
 *                       could not be read or written
 */
function idx_addPage($page){
    global $conf;

    $pfile = $conf['cachedir'].'/page.idx';
    $wfile = $conf['cachedir'].'/word.idx';
    $ifile = $conf['cachedir'].'/index.idx';
    $tfile = $conf['cachedir'].'/index.tmp';

    // load known documents - a missing page.idx means there are none yet
    $page_idx = @file_exists($pfile) ? file($pfile) : false;
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($pfile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    // Note: an empty word list is NOT a reason to stop here. The index
    // still has to be rewritten so that entries from an earlier, non-empty
    // version of the page get removed.

    // Open index and temp file. A missing index is treated as empty.
    $tmp = fopen($tfile,'w');
    $idx = @file_exists($ifile) ? fopen($ifile,'r') : null;
    if(!$tmp || $idx === false){
        // release whatever was opened before reporting the error
        if($tmp) fclose($tmp);
        if($idx) fclose($idx);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    if($idx){
        $line = '';
        while (!feof($idx)) {
            // read full line (long lines arrive in several chunks)
            $chunk = fgets($idx, 4096);
            if($chunk === false) break;
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;

            // write a new line to temp file; words not used by the page
            // have no entry in $words and get a count of 0
            idx_writeIndexLine($tmp,$line,$pid,
                               isset($words[$lno]) ? $words[$lno] : 0);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // the last line may lack its trailing newline - don't lose it
        if($line !== ''){
            idx_writeIndexLine($tmp,$line,$pid,
                               isset($words[$lno]) ? $words[$lno] : 0);
            $lno++;
        }
        fclose($idx);
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = @file_exists($wfile) ? file($wfile) : false;
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    if(@rename($tfile,$ifile)){
        return true;
    }elseif(copy($tfile,$ifile)){
        unlink($tfile);
        return true;
    }
    return false;
}
176
/**
 * Emit one line of the index file
 *
 * An index line is a list of "doc*count" entries separated by colons.
 * All entries of the given line are copied to the filehandle, except the
 * one belonging to $pid. If $count is non-zero a fresh "$pid*$count" entry
 * is appended. The line is always terminated with a newline.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     filehandle to write to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    id of the document being (re)indexed
 * @param int      $count  how often the word occurs in that document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out     = '';
    $entries = trim($line);

    if($entries != ''){
        // keep the entries of all other documents
        foreach(explode(':',$entries) as $entry){
            if($entry == '') continue;
            list($docid,$docfreq) = explode('*',$entry);
            if($docid == $pid) continue;
            $out .= "$docid*$docfreq:";
        }
    }

    // re-add the document itself when it still uses the word
    if($count) $out .= "$pid*$count";

    fwrite($fh,$out."\n");
}
209
/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array $words  the words to look up
 * @return array|false   array(word => array(page => count)); words that are
 *                       unknown or have no index line map to an empty
 *                       array. false if the index could not be opened.
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents (missing files count as empty)
    $pfile    = $conf['cachedir'].'/page.idx';
    $wfile    = $conf['cachedir'].'/word.idx';
    $page_idx = @file_exists($pfile) ? file($pfile) : false;
    $word_idx = @file_exists($wfile) ? file($wfile) : false;
    if(!is_array($page_idx)) $page_idx = array();
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        // strict: loose comparison could match a different numeric word
        $wid = array_search("$word\n",$word_idx,true);
        if(is_int($wid)){
            $wids[] = $wid;
            $result[$word] = $wid;   // placeholder, replaced by docs below
        }else{
            $result[$word] = array();
        }
    }

    // none of the words is known - no need to read the index at all
    if(!count($wids)) return $result;

    $wids = array_unique($wids);
    sort($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index files",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // $srch is compared with === null because word id 0 is a valid id
    while ($srch !== null && !feof($idx)) {
        // read full line (long lines arrive in several chunks)
        $chunk = fgets($idx, 4096);
        if($chunk === false) break;
        $line .= $chunk;
        // incomplete line - keep reading unless the file simply ends
        // without a trailing newline
        if(substr($line,-1) != "\n" && !feof($idx)) continue;

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);
            $srch = array_shift($wids);       // next word, null when done
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // merge found pages into result array
    foreach(array_keys($result) as $word){
        if(is_int($result[$word])){
            $wid = $result[$word];
            // the index may be shorter than the word list
            $result[$word] = isset($docs[$wid]) ? $docs[$wid] : array();
        }
    }

    return $result;
}
286
/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0, docs whose id is not listed in the
 * page index, malformed entries and pages that no longer exist.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array  $page_idx The list of known pages (lines of page.idx)
 * @param  string $line     A line from the main index
 * @return array            array(page => count)
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line === '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        // pad so that an entry without a "*count" part is skipped as a
        // zero count instead of raising an undefined offset warning
        list($doc,$cnt) = array_pad(explode('*',$part),2,0);
        if(!$cnt) continue;
        // ignore ids that are not (or no longer) in the page index
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against the empty string: a page called "0" is valid
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}
317
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords(): words are lowercased,
 * words shorter than three bytes are dropped unless they are numeric and
 * stopwords are removed.
 *
 * @todo make combined function to use alone or in getPageWords
 *
 * @param  string $string     the string to split into words
 * @param  array  $stopwords  list of stopwords, each one terminated by a
 *                            newline (as returned by file()); anything
 *                            that is not an array disables the filter
 * @return array              list of words
 */
function idx_tokenizer($string,&$stopwords){
    $words = array();

    // both branches have to tolerate a missing stopword list
    $filter = is_array($stopwords);

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words
        $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);

        $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($filter && in_array("$w\n",$stopwords,true)) continue;
            $words[] = $w;
        }
    }else{
        // plain ASCII word or number - no UTF-8 handling needed
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($filter && in_array("$w\n",$stopwords,true)) return $words;
        $words[] = $w;
    }

    return $words;
}
349
350//Setup VIM: ex: et ts=4 enc=utf-8 :
351