xref: /dokuwiki/inc/indexer.php (revision 44ca0adf4877b3bb6ac8bf0ff41fb908fed7a827)
1b4ce25e9SAndreas Gohr<?php
2b4ce25e9SAndreas Gohr/**
 * DokuWiki search indexer functions
4b4ce25e9SAndreas Gohr *
5b4ce25e9SAndreas Gohr * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6b4ce25e9SAndreas Gohr * @author     Andreas Gohr <andi@splitbrain.org>
7b4ce25e9SAndreas Gohr */
8b4ce25e9SAndreas Gohr
9b4ce25e9SAndreas Gohr  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10b4ce25e9SAndreas Gohr  require_once(DOKU_CONF.'dokuwiki.php');
11b4ce25e9SAndreas Gohr  require_once(DOKU_INC.'inc/io.php');
12b4ce25e9SAndreas Gohr  require_once(DOKU_INC.'inc/utf8.php');
13b4ce25e9SAndreas Gohr  require_once(DOKU_INC.'inc/parserutils.php');
14b4ce25e9SAndreas Gohr
/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * The raw wiki text of the page is passed through utf8_stripspecials() and
 * utf8_strtolower(), split on whitespace and counted per distinct word.
 * Non-numeric words shorter than three bytes are skipped. Each remaining
 * word is mapped to its word ID, which is its zero-based line number in
 * word.idx; words not listed there yet are appended and word.idx is written
 * back (only when something was appended).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  page ID, handed to rawWiki()
 * @return array|false   word ID => number of occurrences in the page,
 *                       false if word.idx could not be read or written
 */
function idx_getPageWords($page){
    global $conf;
    $word_file = $conf['cachedir'].'/word.idx';

    // a missing word.idx simply means that no word has been indexed yet;
    // an existing but unreadable one must abort, otherwise the write below
    // would replace the whole word list with just this page's words
    $word_idx = array();
    if(file_exists($word_file)){
        $word_idx = file($word_file);
        if($word_idx === false){
            trigger_error("Failed to read word.idx", E_USER_ERROR);
            return false;
        }
    }

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    // split on any run of ASCII whitespace, not only single blanks: a word
    // carrying a line break would add extra lines to word.idx and shift
    // every later word ID
    $words = preg_split('/[ \t\r\n]+/',$body,-1,PREG_SPLIT_NO_EMPTY);
    if($words === false) $words = array();

    $index = array(); //resulting index
    $dirty = false;   //did we append to the word list?

    // array_count_values() groups by exact string, so numeric look-alikes
    // such as "10" and "10.0" are counted as different words
    //compact wordlist FIXME check for stopwords
    foreach(array_count_values($words) as $word => $count){
        // purely numeric array keys come back as integers
        $word = (string) $word;
        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        #FIXME add config option for min wordsize
        if(!is_numeric($word) && strlen($word) < 3) continue;

        //FIXME add stopword check

        // get word ID - strict search, a loose one would match "1e3" for "1000"
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
            $dirty = true;
        }
        // add to index
        $index[$wid] = $count;
    }

    // save back word index, but only if new words were added
    if($dirty){
        $fh = fopen($word_file,'w');
        if(!$fh){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
        $written = fwrite($fh,join('',$word_idx));
        fclose($fh);
        if($written === false){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
    }

    return $index;
}
92b4ce25e9SAndreas Gohr
/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is looked up in (or appended to) page.idx, its zero-based line
 * number being the page ID. index.idx is then copied line by line into
 * index.tmp, where line N holds the entries for word ID N: the page's old
 * entry is dropped from every line and a fresh one is added where the page
 * uses that word. Finally index.tmp is renamed over index.idx.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  page ID
 * @return bool          true on success, false if an index file could not
 *                       be read, written or renamed
 */
function idx_addPage($page){
    global $conf;

    $page_file  = $conf['cachedir'].'/page.idx';
    $word_file  = $conf['cachedir'].'/word.idx';
    $index_file = $conf['cachedir'].'/index.idx';
    $tmp_file   = $conf['cachedir'].'/index.tmp';

    // load known documents - a missing file means nothing was indexed yet
    $page_idx = array();
    if(file_exists($page_file)){
        $page_idx = file($page_file);
        if($page_idx === false) return false;
    }

    // get page id (this is the linenumber in page.idx)
    // strict search: a loose one would treat IDs like "1e3" and "1000" as equal
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($page_file,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    // An empty result is processed like any other: the page may have been
    // indexed with words before, and those entries have to be removed.
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // Open index and temp file (no index yet is fine, it is built below)
    $idx = file_exists($index_file) ? fopen($index_file,'r') : null;
    $tmp = fopen($tmp_file,'w');
    if($idx === false || !$tmp){
        // don't leak the handle that did open
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    if($idx){
        $line = '';
        while(($chunk = fgets($idx, 4096)) !== false){
            // read full line (fgets may return it in several pieces)
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;

            // write a new line to temp file
            idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // a final line without trailing newline must not get lost
        if($line !== ''){
            idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
            $lno++;
        }
        fclose($idx);
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $wcnt = 0;
    if(file_exists($word_file)){
        $word_idx = file($word_file);
        if($word_idx === false){
            fclose($tmp);
            return false;
        }
        $wcnt = count($word_idx);
    }
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($tmp_file,$index_file);
}
164*44ca0adfSAndreas Gohr
/**
 * Emit one line of the index file to an open file handle
 *
 * The given line is read as a ':' separated list of "doc*count" entries.
 * Every entry belonging to a document other than $pid is carried over
 * (each followed by ':'); the entry of $pid itself is dropped and, when
 * $count is non-zero, replaced by a fresh "$pid*$count" at the end of the
 * line. A newline is always written, even if the line ends up empty.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  resource $fh     handle to write to
 * @param  string   $line   existing index line, may be empty
 * @param  int      $pid    ID of the document being (re)indexed
 * @param  int      $count  occurrences in that document, falsy for none
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out = '';

    // keep the entries of all other documents
    foreach(explode(':',trim($line)) as $entry){
        if($entry == '') continue;

        list($doc,$cnt) = explode('*',$entry);
        if($doc == $pid) continue; // stale entry of our own document

        $out .= "$doc*$cnt:";
    }

    // append the current count for our document
    if($count) $out .= "$pid*$count";

    // terminate the line
    fwrite($fh,$out."\n");
}
197b4ce25e9SAndreas Gohr
198b4ce25e9SAndreas Gohr//Setup VIM: ex: et ts=4 enc=utf-8 :
199