<?php
/**
 * Common DokuWiki functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  // NOTE: DOKU_CONF is not defined in this file; the including script has to
  // define it before this file is loaded.
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * The raw wiki text of the page is stripped of special characters,
 * lowercased and split at spaces. Non-numeric words shorter than three
 * bytes and words listed in the language's stopwords.txt are ignored.
 * Words that are not yet listed in word.idx are appended to it and the
 * file is written back.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  id of the page to split
 * @return array|false   word counts, keyed by word id (the zero-based line
 *                       number of the word in word.idx); false if an error
 *                       occurred
 */
function idx_getPageWords($page){
    global $conf;

    // load the known words; a missing file is treated as an empty word list
    $widxfile = $conf['cachedir'].'/word.idx';
    if(@file_exists($widxfile)){
        $word_idx = file($widxfile);
        if($word_idx === false) return false;
    }else{
        $word_idx = array();
    }

    // load the stopwords if the current language provides any
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if($stopwords === false) $stopwords = array();
    }else{
        $stopwords = array();
    }

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);
    // Sort as plain strings: the default flags compare numeric-looking
    // strings as numbers, which would treat e.g. "1000" and "1e3" as equal.
    sort($words, SORT_STRING);

    $index = array(); // resulting index
    $old   = '';      // the previously seen word
    $wid   = -1;      // word id of the previously accepted word
    $doit  = true;    // was the previously seen word accepted?

    // compact the sorted word list into counts per word id
    foreach($words as $word){
        if(strlen($word) == 0) continue;

        // it's the same word as last time
        if($word === $old){
            // only count it if it was accepted the first time
            if($doit) $index[$wid]++;
            continue;
        }

        // remember old word
        $old  = $word;
        $doit = true;

        // checking minimum word-size (excepting numbers)
        if(!is_numeric($word) && strlen($word) < 3){
            $doit = false;
            continue;
        }

        // stopword check (strict, so numeric-looking words are compared as text)
        if(array_search("$word\n",$stopwords,true) !== false){
            $doit = false;
            continue;
        }

        // get word ID, registering the word if it is not known yet
        $wid = array_search("$word\n",$word_idx,true);
        if($wid === false){
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }

        // add to index
        $index[$wid] = 1;
    }

    // save back word index
    $fh = fopen($widxfile,'w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if necessary, its words are
 * counted with idx_getPageWords() and index.idx is rewritten through
 * the temporary file index.tmp. index.idx has to exist already.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  id of the page to index
 * @return bool          true on success (or when the page has no indexable
 *                       words), false on failure
 */
function idx_addPage($page){
    global $conf;

    // load known documents; a missing file is treated as an empty list
    $pidxfile = $conf['cachedir'].'/page.idx';
    if(@file_exists($pidxfile)){
        $page_idx = file($pidxfile);
        if($page_idx === false) return false;
    }else{
        $page_idx = array();
    }

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if($pid === false){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($pidxfile,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (long lines arrive in several chunks)
        $chunk = fgets($idx, 4096);
        if($chunk === false) break; // end of file or read error
        $line .= $chunk;
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file; words not used on this page count 0
        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,$line,$pid,$cnt);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // a last line without trailing newline is still a valid index line
    if($line !== ''){
        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,$line,$pid,$cnt);
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    if($word_idx === false){
        fclose($tmp);
        return false;
    }
    $wcnt = count($word_idx);
    for(; $lno<$wcnt; $lno++){
        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,'',$pid,$cnt);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($conf['cachedir'].'/index.tmp',
                  $conf['cachedir'].'/index.idx');
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * A line consists of "doc*count" entries separated by colons.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle to write the line to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    id of the document to update
 * @param int      $count  number of occurrences in the document; the
 *                         document is left out of the line when this is empty
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);
    $out  = '';

    if($line != ''){
        $parts = explode(':',$line);
        // keep all entries except the one of the given doc
        foreach($parts as $part){
            if($part == '') continue;
            // pad so an entry without '*' does not raise a notice
            list($doc,$cnt) = array_pad(explode('*',$part),2,'');
            // loose comparison on purpose: $doc is a string read from the
            // file while $pid is usually an integer
            if($doc != $pid){
                $out .= "$doc*$cnt:";
            }
        }
    }

    // add doc
    if ($count){
        $out .= "$pid*$count";
    }

    // add newline
    fwrite($fh,$out."\n");
}

//Setup VIM: ex: et ts=4 enc=utf-8 :