xref: /dokuwiki/inc/indexer.php (revision 14c3d17d7c7922d6a775d248456f0fd102b57f0e)
1<?php
2/**
3 * DokuWiki indexer functions (maintain the word, page and index files)
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
/**
 * Split a page into words
 *
 * Reads the raw wiki text of the given page, splits it on whitespace,
 * lowercases every token and counts how often each word occurs. Tokens
 * containing non-word characters are split further via
 * utf8_stripspecials(). Non-numeric words shorter than three bytes and
 * words listed in the language's stopwords.txt are not indexed.
 *
 * Every word is mapped to its word id, which is its line number in
 * word.idx. Words not yet known are appended and word.idx is written back.
 *
 * Returns an array of word counts (word id => frequency), false if
 * word.idx could not be written.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  id of the page to split
 * @return array|false   array(word id => frequency) or false on error
 */
function idx_getPageWords($page){
    global $conf;

    // known words, one per line - a missing or unreadable file counts as empty
    $wordfile = $conf['cachedir'].'/word.idx';
    $word_idx = @file_exists($wordfile) ? file($wordfile) : false;
    if(!is_array($word_idx)) $word_idx = array();

    // lookup table "word\n" => word id; the first occurrence wins, which is
    // what a linear search over the list would have returned
    $word_ids = array();
    foreach ($word_idx as $id => $entry) {
        if (!isset($word_ids[$entry])) $word_ids[$entry] = $id;
    }

    // stopwords as array keys; line endings are stripped so CRLF files and
    // a last line without newline still match
    $stopwords = array();
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $swlines = file($swfile);
        if(is_array($swlines)){
            foreach ($swlines as $sw) {
                $stopwords[rtrim($sw,"\r\n")] = true;
            }
        }
    }

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        $word = utf8_strtolower($word);

        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/\W/', $word)) {
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                // the token itself occurred $count times, so each of its
                // parts did as well
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            // check the current word here, not the inner loop variable
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        if (isset($stopwords[$word])) continue;

        $entry = "$word\n";
        if (isset($word_ids[$entry])) {
            $wid = $word_ids[$entry];
        } else {
            // unknown word - append it, its id is the new last line
            $word_idx[] = $entry;
            $wid = count($word_idx)-1;
            $word_ids[$entry] = $wid;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($wordfile,'w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}
81
/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is looked up in page.idx (and appended if unknown) to get
 * its page id. Then index.idx is copied line by line to index.tmp,
 * replacing the entry for this page on every line with the word count
 * returned by idx_getPageWords(). Finally index.tmp is renamed over
 * index.idx.
 *
 * NOTE(review): when the page yields no indexable words the function
 * returns true before touching index.idx, so entries written for the
 * page earlier stay in the index - confirm this is intended.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  id of the page to index
 * @return bool          true on success, false on any failure
 */
function idx_addPage($page){
    global $conf;

    // load known documents - a missing or unreadable file counts as empty
    $pagefile = $conf['cachedir'].'/page.idx';
    $page_idx = @file_exists($pagefile) ? file($pagefile) : false;
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    // strict search: ids that merely look numerically equal must not match
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($pagefile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (lines longer than the buffer arrive in pieces)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file; words not used by the page get 0
        idx_writeIndexLine($tmp,$line,$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // a last line without trailing newline is still a line of the index
    if($line !== ''){
        idx_writeIndexLine($tmp,$line,$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($conf['cachedir'].'/index.tmp',
                  $conf['cachedir'].'/index.idx');
}
153
/**
 * Write a new index line to the filehandle
 *
 * An index line is a colon separated list of "doc*count" entries.
 * All entries of the given line are copied except the one belonging
 * to document $pid. If $count is non-zero a fresh "$pid*$count" entry
 * is appended. The written line is always terminated by a newline.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     open, writable file handle
 * @param string   $line   existing index line, may be empty
 * @param int      $pid    id of the document being (re)indexed
 * @param int      $count  frequency of the word in that document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    // assemble the complete line first, then write it in one go
    $out = '';

    // an empty line explodes to a single empty entry which is skipped below
    foreach(explode(':',trim($line)) as $entry){
        if($entry == '') continue;

        list($docid,$freq) = explode('*',$entry);

        // drop the old entry of the document being updated
        if($docid == $pid) continue;

        $out .= "$docid*$freq:";
    }

    // re-add the document with its current count
    if ($count){
        $out .= "$pid*$count";
    }

    fwrite($fh,$out."\n");
}
186
187//Setup VIM: ex: et ts=4 enc=utf-8 :
188