xref: /dokuwiki/inc/indexer.php (revision 706882dcf32358de6c9300ced7b5fcfa9ba28771)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * Reads the raw wiki text of the page, normalizes it and counts how
 * often each word occurs. Words which are not yet listed in word.idx
 * are appended to it. The word list is only written back when it
 * changed or did not exist yet.
 *
 * Non-numeric words shorter than three bytes are ignored. Words are
 * compared as strings, so numeric looking words like "100" and "1e2"
 * are counted separately.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  ID of the page to read
 * @return array|false   word ID (line number in word.idx) => number of
 *                       occurrences, false if the word list could not
 *                       be read or written
 */
function idx_getPageWords($page){
    global $conf;
    $word_file = $conf['cachedir'].'/word.idx';

    // load the known words - the line number of a word is its ID
    $word_idx = array();
    $dirty    = true; // a missing word list needs to be created
    if(file_exists($word_file)){
        $word_idx = file($word_file);
        if($word_idx === false){
            // never continue with an empty list here, it would
            // overwrite the existing words and invalidate all IDs
            trigger_error("Failed to read word.idx", E_USER_ERROR);
            return false;
        }
        $dirty = false;
    }

    // strip the line endings and build a word => ID map for fast lookups
    $lookup = array();
    foreach($word_idx as $wid => $entry){
        $entry = rtrim($entry,"\r\n");
        $word_idx[$wid] = $entry;
        // on duplicates the first occurrence wins
        if(!isset($lookup[$entry])) $lookup[$entry] = $wid;
    }

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);
    // sorting as strings keeps the order of newly assigned IDs stable
    sort($words, SORT_STRING);

    $index = array(); //resulting index

    //count words FIXME check for stopwords
    foreach($words as $word){
        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        // FIXME add config option for max wordsize
        if(!is_numeric($word) && strlen($word) < 3) continue;

        // get word ID, register unknown words
        if(isset($lookup[$word])){
            $wid = $lookup[$word];
        }else{
            $word_idx[]    = $word;
            $wid           = count($word_idx)-1;
            $lookup[$word] = $wid;
            $dirty         = true;
        }

        // add to index
        $index[$wid] = isset($index[$wid]) ? $index[$wid]+1 : 1;
    }

    // save back word index if needed
    if($dirty){
        $fh = fopen($word_file,'w');
        if(!$fh){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
        if(count($word_idx)){
            fwrite($fh,join("\n",$word_idx)."\n");
        }
        fclose($fh);
    }

    return $index;
}
92
/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is not known yet. Then
 * index.idx is copied line by line to index.tmp, replacing the
 * entries of this page with its current word counts, and the
 * temp file is moved over the index.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  ID of the page to index
 * @return bool          true on success, false on any error
 */
function idx_addPage($page){
    global $conf;

    $page_file  = $conf['cachedir'].'/page.idx';
    $word_file  = $conf['cachedir'].'/word.idx';
    $index_file = $conf['cachedir'].'/index.idx';
    $tmp_file   = $conf['cachedir'].'/index.tmp';

    // load known documents
    $page_idx = array();
    if(file_exists($page_file)){
        $page_idx = file($page_file);
        // an unreadable list must not be replaced by a new one
        if($page_idx === false) return false;
    }

    // make sure the last entry is terminated, otherwise it would not
    // be found and would be glued to the next entry on write back
    $last = count($page_idx)-1;
    if($last >= 0 && substr($page_idx[$last],-1) != "\n"){
        $page_idx[$last] .= "\n";
    }

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($page_file,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // no words and no index: nothing to add, nothing to clean up.
    // With an existing index we still need to go on to remove the
    // entries a former version of the page might have left there.
    if(!count($words) && !file_exists($index_file)) return true;

    // Open index and temp file
    $idx = fopen($index_file,'r');
    if(!$idx){
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }
    $tmp = fopen($tmp_file,'w');
    if(!$tmp){
        fclose($idx);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying were needed
    $lno  = 0;
    $line = '';
    while(!feof($idx)){
        // read full line (long lines arrive in multiple chunks)
        $chunk = fgets($idx, 4096);
        if($chunk === false) break; // EOF or read error
        $line .= $chunk;
        if(substr($line,-1) != "\n") continue;

        // write a new Line to temp file
        $count = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,$line,$pid,$count);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // the last line of the index might lack the final newline
    if($line !== ''){
        $count = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,$line,$pid,$count);
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file_exists($word_file) ? file($word_file) : false;
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        $count = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,'',$pid,$count);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($tmp_file,$index_file);
}
164
/**
 * Emit one line of the index file
 *
 * A line consists of "document*count" entries separated by colons.
 * All entries of the given line are kept, except the one belonging
 * to the given document. When $count is non-zero a fresh entry for
 * the document is appended. The line is always terminated with a
 * newline, even when it carries no entries at all.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  resource $fh     handle to write to
 * @param  string   $line   the existing index line, may be empty
 * @param  int      $pid    ID of the document to replace
 * @param  int      $count  new word count of the document, a false
 *                          value drops the document from the line
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out     = '';
    $current = trim($line);

    // carry over the entries of all other documents
    if($current != ''){
        foreach(explode(':',$current) as $entry){
            if($entry == '') continue;
            list($doc,$cnt) = explode('*',$entry);
            if($doc == $pid) continue; // dropped, readded below if needed
            $out .= "$doc*$cnt:";
        }
    }

    // readd the document with its new count
    if($count){
        $out .= "$pid*$count";
    }

    // write the whole line at once
    fwrite($fh,$out."\n");
}
197
198//Setup VIM: ex: et ts=4 enc=utf-8 :
199