xref: /dokuwiki/inc/indexer.php (revision ea3a66b20ac38cf452125dca0bc416d480d6d82c)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
/**
 * Split a page into words
 *
 * Reads the raw wiki text of the given page, counts how often each word
 * occurs and translates every word into its word ID (the line number of
 * the word in word.idx). Words that are not yet listed in word.idx are
 * appended to it; word.idx is written back to the cache directory on
 * every call.
 *
 * Non-numeric words shorter than 3 bytes and words listed in the
 * stopwords.txt of the configured language are left out.
 *
 * @param  string $page  the page to split
 * @return array|false   array(word ID => frequency), false if word.idx
 *                       could not be opened for writing
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;

    // load the known words; a missing or unreadable file is an empty list
    $widxfile = $conf['cachedir'].'/word.idx';
    $word_idx = @file_exists($widxfile) ? file($widxfile) : false;
    if(!is_array($word_idx)) $word_idx = array();

    // load the stopwords of the configured language, if there are any
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : false;
    if(!is_array($stopwords)) $stopwords = array();

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {

        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                // $c is the count inside a single token, the token itself
                // occurred $count times in the page
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // strict comparison: a loose one would treat numeric looking
        // words like "1e3" and "1000" as the same word
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word - its ID is the new last line of word.idx
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($widxfile,'w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}
82
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is not known yet, then
 * index.idx is copied line by line to index.tmp with the entry of
 * this page replaced in every line, and finally index.tmp is moved
 * over index.idx.
 *
 * @param  string $page  the page to index
 * @return bool          true on success (also when the page yielded no
 *                       words at all), false on any error
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents; a missing or unreadable file is an empty list
    $pagefile = $conf['cachedir'].'/page.idx';
    $page_idx = @file_exists($pagefile) ? file($pagefile) : false;
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($pagefile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    // NOTE: the index is left untouched here, so entries written for this
    // page by an earlier run stay in index.idx
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (lines longer than the read buffer are
        // collected over several iterations)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new Line to temp file, words the page does not
        // contain get a count of 0
        idx_writeIndexLine($tmp,$line,$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // the last line may lack the trailing newline - don't drop it
    if($line != ''){
        idx_writeIndexLine($tmp,$line,$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    if(@rename($conf['cachedir'].'/index.tmp',
               $conf['cachedir'].'/index.idx')){
        return true;
    }elseif(copy($conf['cachedir'].'/index.tmp',
                 $conf['cachedir'].'/index.idx')){
        unlink($conf['cachedir'].'/index.tmp');
        return true;
    }
    return false;
}
162
/**
 * Write a new index line to the filehandle
 *
 * An index line is a colon separated list of "document*count" pairs.
 * All pairs of the given line are kept except the one belonging to
 * $pid; a fresh "$pid*$count" pair is appended when $count is not
 * empty. The line is always terminated by a newline.
 *
 * @param resource $fh     filehandle to write to
 * @param string   $line   the current index line, may be empty
 * @param int      $pid    the document whose entry is replaced
 * @param int      $count  new count for the document, an empty value
 *                         removes the document from the line
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out     = '';
    $entries = trim($line);

    // carry over the entries of all other documents
    if($entries != ''){
        foreach(explode(':',$entries) as $entry){
            if($entry == '') continue;
            list($doc,$cnt) = explode('*',$entry);
            if($doc == $pid) continue;   // dropped, readded below if needed
            $out .= $doc.'*'.$cnt.':';
        }
    }

    // the (new) entry of the given document comes last
    if($count){
        $out .= $pid.'*'.$count;
    }

    fwrite($fh,$out."\n");
}
195
/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param  array $words  the words to look up
 * @return array|false   array(word => array(page => count)), words
 *                       without any match map to an empty array;
 *                       false if the index could not be opened
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($page_idx)) $page_idx = array();
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        // strict comparison: a loose one would treat numeric looking
        // words like "1e3" and "1000" as the same word
        $wid = array_search("$word\n",$word_idx,true);
        if(is_int($wid)){
            $wids[] = $wid;
            $result[$word] = $wid;    // replaced by the doc list below
        }else{
            $result[$word] = array();
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index files",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // $srch is null when there is nothing (more) to look for. This needs
    // the strict check as the word ID 0 is a valid ID.
    while ($srch !== null && !feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch)             break;   // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);
            $srch = array_shift($wids);        // next word to look up
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // the last line may lack the trailing newline
    if($srch !== null && $line != '' && $lno == $srch){
        $docs[$srch] = idx_parseIndexLine($page_idx,$line);
    }
    fclose($idx);

    // merge found pages into result array
    foreach(array_keys($result) as $word){
        if(is_int($result[$word])){
            $wid = $result[$word];
            // the index may have less lines than word.idx
            $result[$word] = isset($docs[$wid]) ? $docs[$wid] : array();
        }
    }

    return $result;
}
272
/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0, docs that are not listed in the
 * given page list and pages that no longer exist. Malformed entries
 * of the line are skipped.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;

        // each part has to be a "doc*count" pair
        $pair = explode('*',$part);
        if(count($pair) < 2) continue;
        list($doc,$cnt) = $pair;
        if(!$cnt) continue;

        // translate the page number to the page name
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against the empty string, "0" is a usable page name
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}
303
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords(): strings containing
 * anything but ASCII letters and digits are split with
 * utf8_stripspecials(), non-numeric words shorter than 3 bytes are
 * dropped and the remaining words are lowercased.
 *
 * @param  string $string     the string to tokenize
 * @param  array  $stopwords  stopwords as read by file() (each entry
 *                            ends with a newline); an empty value
 *                            disables the stopword filter
 * @return array              list of search words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords){
    $words = array();

    // only filter when there really is a stopword list - searching in
    // null or false is an error
    $filter = is_array($stopwords) && count($stopwords);

    // simple filter to restrict use of the utf8 functions
    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        $arr  = explode(' ', utf8_stripspecials($string,' ','._\-:'));
        $utf8 = true;
    }else{
        $arr  = array($string);
        $utf8 = false;
    }

    foreach ($arr as $w) {
        if (!is_numeric($w) && strlen($w) < 3) continue;
        $w = $utf8 ? utf8_strtolower($w) : strtolower($w);
        // strict comparison: a loose one would treat numeric looking
        // words like "1e3" and "1000" as the same word
        if($filter && is_int(array_search("$w\n",$stopwords,true))) continue;
        $words[] = $w;
    }

    return $words;
}
332
333//Setup VIM: ex: et ts=4 enc=utf-8 :
334