xref: /dokuwiki/inc/indexer.php (revision 7367b36877bca568d785e01be802652b6a719884)
1<?php
2/**
3 * Common DokuWiki functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_CONF.'dokuwiki.php');
11  require_once(DOKU_INC.'inc/io.php');
12  require_once(DOKU_INC.'inc/utf8.php');
13  require_once(DOKU_INC.'inc/parserutils.php');
14
/**
 * Split a page into words and count them
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * The page text is normalised, split at blanks and every remaining word is
 * mapped to its word ID, which is the zero based line number of that word
 * in word.idx. Words that are not listed in word.idx yet are appended to it.
 *
 * Words shorter than three bytes (unless numeric) and words listed in the
 * stopword file of the configured language are not indexed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page, handed to rawWiki() to get the text
 * @return array|false   word ID => number of occurrences in the page,
 *                       false if word.idx could not be read or written
 */
function idx_getPageWords($page){
    global $conf;

    $wordfile = $conf['cachedir'].'/word.idx';
    $changed  = false; // does word.idx need to be written back?

    // load the known words, a missing file is treated as an empty word list
    if(@file_exists($wordfile)){
        $lines = file($wordfile);
        if(!is_array($lines)){
            // never continue with an empty list here - writing it back
            // later would wipe all existing word IDs
            trigger_error("Failed to read word.idx", E_USER_ERROR);
            return false;
        }
    }else{
        $lines   = array();
        $changed = true; // create the (possibly empty) file below
    }

    // $wordlist holds the words without line endings in file order,
    // $known maps word => word ID for constant time lookups. The first
    // occurrence of a duplicated line wins, just like with array_search()
    $wordlist = array();
    $known    = array();
    foreach($lines as $no => $entry){
        $entry      = rtrim($entry,"\r\n");
        $wordlist[] = $entry;
        if(!isset($known[$entry])) $known[$entry] = $no;
    }

    // load the stopwords of the configured language (if there are any)
    $stopwords = array();
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $lines = file($swfile);
        if(is_array($lines)){
            foreach($lines as $entry){
                $entry = rtrim($entry,"\r\n");
                if($entry !== '') $stopwords[$entry] = true;
            }
        }
    }

    // split page into words: special chars are handed to
    // utf8_stripspecials() with a blank as replacement, then everything
    // is lowercased and split at blanks
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $words = explode(' ',trim($body));

    $index = array(); //resulting index

    foreach($words as $word){
        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        if(strlen($word) < 3 && !is_numeric($word)) continue;

        // stopword check
        if(isset($stopwords[$word])) continue;

        // get word ID, unknown words are appended to the word list
        if(isset($known[$word])){
            $wid = $known[$word];
        }else{
            $wordlist[]   = $word;
            $wid          = count($wordlist)-1;
            $known[$word] = $wid;
            $changed      = true;
        }

        // add to index
        if(isset($index[$wid])){
            $index[$wid]++;
        }else{
            $index[$wid] = 1;
        }
    }

    // save back word index, but only when it was actually modified
    if($changed){
        $fh = fopen($wordfile,'w');
        if(!$fh){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
        // one word per line, every line (including the last) ends with \n
        if(count($wordlist)) fwrite($fh,join("\n",$wordlist)."\n");
        fclose($fh);
    }

    return $index;
}
102
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page gets an ID (its zero based line number in page.idx), then
 * index.idx is copied line by line to index.tmp. While copying, the old
 * entry of the page is removed from every line and a new one is added
 * where the page contains the word belonging to that line. Finally the
 * temp file is renamed over the index.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page to index
 * @return bool          true on success, false if one of the index files
 *                       could not be read, written or renamed
 */
function idx_addPage($page){
    global $conf;

    $pagefile = $conf['cachedir'].'/page.idx';
    $wordfile = $conf['cachedir'].'/word.idx';
    $idxfile  = $conf['cachedir'].'/index.idx';
    $tmpfile  = $conf['cachedir'].'/index.tmp';

    // load known documents, a missing file means nothing was indexed yet
    $page_idx = @file_exists($pagefile) ? file($pagefile) : array();
    if(!is_array($page_idx)) return false;

    // get page id (this is the linenumber in page.idx)
    // strict search: numeric looking names like "1" and "01" must not match
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($pagefile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    // An empty result is processed like any other: the page may have been
    // indexed with words before and its old entries need to be removed
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // Open index and temp file, a missing index is treated as empty
    $idx = null;
    if(@file_exists($idxfile)){
        $idx = fopen($idxfile,'r');
        if(!$idx){
            trigger_error("Failed to open index files", E_USER_ERROR);
            return false;
        }
    }
    $tmp = fopen($tmpfile,'w');
    if(!$tmp){
        if($idx) fclose($idx); // don't leak the handle we already have
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    if($idx){
        while(($chunk = fgets($idx,4096)) !== false){
            // read full line - long lines arrive in several chunks
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;

            // write a new line to temp file
            $count = isset($words[$lno]) ? $words[$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$count);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // a last line without trailing newline is still sitting in the
        // buffer - don't lose it
        if($line !== ''){
            $count = isset($words[$lno]) ? $words[$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$count);
            $lno++;
        }
        fclose($idx);
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = @file_exists($wordfile) ? file($wordfile) : array();
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        $count = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,'',$pid,$count);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($tmpfile,$idxfile);
}
174
/**
 * Write one line of the index file to the given filehandle
 *
 * An index line is a list of "document*count" entries separated by
 * colons. All entries of the given line are copied, except the one
 * belonging to $pid. If $count is non-zero a fresh "$pid*$count" entry
 * is appended. The written line always ends with a newline.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     filehandle to write to
 * @param string   $line   existing index line, may be empty
 * @param int      $pid    ID of the document to remove/readd
 * @param int      $count  number of occurrences in the document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    // the new line is assembled here and written in one go
    $out = '';

    // keep the entries of all other documents - an empty line yields
    // a single empty entry which is skipped like any other empty one
    foreach(explode(':',trim($line)) as $entry){
        if($entry == '') continue;
        list($doc,$cnt) = explode('*',$entry);
        if($doc == $pid) continue; // old entry of our document
        $out .= "$doc*$cnt:";
    }

    // readd our document when it contains the word
    if($count) $out .= "$pid*$count";

    fwrite($fh,$out."\n");
}
207
208//Setup VIM: ex: et ts=4 enc=utf-8 :
209