xref: /dokuwiki/inc/indexer.php (revision 7367b36877bca568d785e01be802652b6a719884)
1b4ce25e9SAndreas Gohr<?php
2b4ce25e9SAndreas Gohr/**
3b4ce25e9SAndreas Gohr * Common DokuWiki functions
4b4ce25e9SAndreas Gohr *
5b4ce25e9SAndreas Gohr * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6b4ce25e9SAndreas Gohr * @author     Andreas Gohr <andi@splitbrain.org>
7b4ce25e9SAndreas Gohr */
8b4ce25e9SAndreas Gohr
9b4ce25e9SAndreas Gohr  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10b4ce25e9SAndreas Gohr  require_once(DOKU_CONF.'dokuwiki.php');
11b4ce25e9SAndreas Gohr  require_once(DOKU_INC.'inc/io.php');
12b4ce25e9SAndreas Gohr  require_once(DOKU_INC.'inc/utf8.php');
13b4ce25e9SAndreas Gohr  require_once(DOKU_INC.'inc/parserutils.php');
14b4ce25e9SAndreas Gohr
/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * The raw wiki text of the page is stripped of special chars, lowercased
 * and split at spaces. Each remaining word is counted, except for
 *
 *   - non-numeric words shorter than 3 bytes
 *   - words listed in the stopwords.txt of the configured language
 *
 * A word's ID is its (zero based) line number in word.idx. Words which
 * are not listed there yet are appended and the file is written back.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  ID of the page to split
 * @return array|false   array of (word ID => number of occurrences),
 *                       false if word.idx could not be read or written
 */
function idx_getPageWords($page){
    global $conf;
    $wordfile = $conf['cachedir'].'/word.idx';
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

    // load the known words - a missing file just means nothing was indexed yet
    $word_idx   = array();
    $have_index = @file_exists($wordfile);
    if($have_index){
        $word_idx = file($wordfile);
        if($word_idx === false){
            // don't continue with an empty list, it would overwrite the index
            trigger_error("Failed to read word.idx", E_USER_ERROR);
            return false;
        }
    }

    // build a lookup table (line => word ID) instead of searching the whole
    // list for each word; array keys are compared exactly, so "007" and "7"
    // are never mixed up. The first occurrence wins, like array_search did
    $known = array();
    foreach($word_idx as $id => $entry){
        if(!isset($known[$entry])) $known[$entry] = $id;
    }

    // load stopwords as a set; line endings are stripped so CRLF files and
    // a last line without newline work as well
    $stopwords = array();
    if(@file_exists($swfile)){
        $lines = file($swfile);
        if($lines !== false){
            foreach($lines as $stopword){
                $stopword = rtrim($stopword,"\r\n");
                if($stopword !== '') $stopwords[$stopword] = true;
            }
        }
    }

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);

    // count each distinct word; sorting as strings keeps the order in which
    // new words get their IDs deterministic
    $counts = array_count_values(explode(' ',$body));
    ksort($counts, SORT_STRING);

    $index = array(); //resulting index
    $added = false;   //were new words appended to the word list?

    foreach($counts as $word => $count){
        // purely numeric words come back as integer keys
        $word = (string) $word;
        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        if(!is_numeric($word) && strlen($word) < 3) continue;

        // stopword check
        if(isset($stopwords[$word])) continue;

        // get word ID
        $entry = "$word\n";
        if(isset($known[$entry])){
            $wid = $known[$entry];
        }else{
            $word_idx[]    = $entry;
            $wid           = count($word_idx)-1;
            $known[$entry] = $wid;
            $added         = true;
        }

        // add to index
        $index[$wid] = $count;
    }

    // save back word index - only needed when it changed or does not exist
    if($added || !$have_index){
        $fh = fopen($wordfile,'w');
        if(!$fh){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
        fwrite($fh,join('',$word_idx));
        fclose($fh);
    }

    return $index;
}
102b4ce25e9SAndreas Gohr
/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page gets an ID (its line number in page.idx), then index.idx is
 * copied line by line to index.tmp. Line number N of the index belongs to
 * the word with ID N; on each line the old entry of this page is dropped
 * and the new count is added. Finally index.tmp replaces index.idx.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  ID of the page to index
 * @return bool          true on success, false on any file error
 */
function idx_addPage($page){
    global $conf;
    $pagefile  = $conf['cachedir'].'/page.idx';
    $indexfile = $conf['cachedir'].'/index.idx';
    $tmpfile   = $conf['cachedir'].'/index.tmp';

    // load known documents - a missing file means no page was indexed yet
    $page_idx = array();
    if(@file_exists($pagefile)){
        $page_idx = file($pagefile);
        if($page_idx === false) return false;
    }

    // get page id (this is the linenumber in page.idx)
    // strict search: page names like "007" and "7" must not match each other
    $is_new = false;
    $pid    = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid    = count($page_idx)-1;
        $is_new = true;
        // page was new - write back
        $fh = fopen($pagefile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // a page that was never indexed and has no words has no entries to
    // update. A known page without words still has to be processed to get
    // its old entries removed from the index
    if($is_new && !count($words)) return true;

    // Open index and temp file (no index file yet is fine, it is created)
    $idx = null;
    if(@file_exists($indexfile)) $idx = fopen($indexfile,'r');
    $tmp = fopen($tmpfile,'w');
    if($idx === false || !$tmp){
        // don't leak the handle which could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying were needed
    $lno  = 0;
    $line = '';
    if($idx){
        while(!feof($idx)){
            // read full line (long lines arrive in chunks)
            $chunk = fgets($idx, 4096);
            if($chunk === false) break; // EOF or read error
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;

            // write a new Line to temp file
            idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        fclose($idx);

        // the last line had no newline - it still is a line of the index
        if($line != ''){
            idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
            $lno++;
        }
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($tmpfile,$indexfile);
}
17444ca0adfSAndreas Gohr
/**
 * Emit one line of the index file
 *
 * An index line is a list of "document*count" entries separated by
 * colons. All entries of the given line are kept, except the one of the
 * document $pid. When $count is not empty a fresh "$pid*$count" entry is
 * appended. The line is always terminated by a newline, even if it has
 * no entries at all.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  resource $fh     filehandle to write to
 * @param  string   $line   existing index line, may be empty
 * @param  int      $pid    ID of the document to replace
 * @param  int      $count  new count for the document, 0 removes it
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out = '';

    // carry over the entries of all other documents
    foreach(explode(':',trim($line)) as $entry){
        if($entry == '') continue;
        list($docid,$hits) = explode('*',$entry);
        if($docid == $pid) continue;
        $out .= "$docid*$hits:";
    }

    // readd the document with its new count
    if($count) $out .= "$pid*$count";

    // write the finished line at once
    fwrite($fh,$out."\n");
}
207b4ce25e9SAndreas Gohr
208b4ce25e9SAndreas Gohr//Setup VIM: ex: et ts=4 enc=utf-8 :
209