<?php
/**
 * Common DokuWiki functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, splits it on whitespace, normalizes
 * each token (special characters stripped, lowercased) and counts how often
 * each resulting word occurs. Words shorter than three bytes are dropped
 * unless they are numeric; words listed in the language's stopwords.txt are
 * dropped as well.
 *
 * Words that are not yet listed in word.idx are appended to it, and the
 * file is written back on every call.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  ID of the page to split
 * @return array|false   array(word id => frequency), where the word id is
 *                       the line number of the word in word.idx; false if
 *                       word.idx could not be written
 */
function idx_getPageWords($page){
    global $conf;
    $word_idx = file($conf['cachedir'].'/word.idx');
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $body   = rawWiki($page);
    // strtr() ignores the surplus characters of the longer argument, so the
    // replacement string needs one space for each character to be replaced
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {

        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // strict comparison so that numeric looking words (e.g. "007" and
        // "7") are never treated as the same entry
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word - append it, its id is the new last line
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is new, its words are
 * determined through idx_getPageWords() and index.idx is then rewritten
 * line by line into index.tmp, which finally replaces index.idx.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page to index
 * @return bool          true on success (also when the page yielded no
 *                       words), false on any failure
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (lines may be longer than the read buffer)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file; words not used on this page have
        // no entry in $words and get a count of 0
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = count($word_idx);
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    if(@rename($conf['cachedir'].'/index.tmp',
               $conf['cachedir'].'/index.idx')){
        return true;
    }elseif(copy($conf['cachedir'].'/index.tmp',
                 $conf['cachedir'].'/index.idx')){
        unlink($conf['cachedir'].'/index.tmp');
        return true;
    }
    return false;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * An index line has the form "doc*count:doc*count:...".
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     filehandle to write to
 * @param string   $line   the existing index line ('' for a new line)
 * @param int      $pid    ID of the document being (re)indexed
 * @param int      $count  occurrences of the word in that document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            list($doc,$cnt) = explode('*',$part);
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array $words  the words to look up
 * @return array|false   array(word => array(page => count)); a word that
 *                       is not in the index maps to an empty array. false
 *                       if index.idx could not be opened
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    $word_idx = file($conf['cachedir'].'/word.idx');

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $wid = array_search("$word\n",$word_idx,true);
        if(is_int($wid)){
            $wids[] = $wid;
            $result[$word] = $wid; // replaced by the document list below
        }else{
            $result[$word] = array();
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index files",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();            // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids); // which word do we look for? (null if none)
    while ($srch !== null && !feof($idx)) {
        // read full line (lines may be longer than the read buffer)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch) break; // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);

            $srch = array_shift($wids); // next word to look up
            if($srch === null) break;   // no more words
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // merge found pages into result array
    foreach(array_keys($result) as $word){
        if(is_int($result[$word])){
            $wid = $result[$word];
            // the index may have fewer lines than word.idx - treat a
            // missing line like a word without any documents
            $result[$word] = isset($docs[$wid]) ? $docs[$wid] : array();
        }
    }

    return $result;
}

/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        list($doc,$cnt) = explode('*',$part);
        if(!$cnt) continue;
        // skip document ids that are not listed in the page index
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        if(!$doc) continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * @todo make combined function to use alone or in getPageWords
 *
 * @param  string $string     the string to tokenize
 * @param  array  $stopwords  stopword lines (each ending in a newline);
 *                            may be empty
 * @return array              list of lowercased words
 */
function idx_tokenizer($string,&$stopwords){
    $words = array();

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($stopwords && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        // same guard as above: nothing to search in without stopwords
        if($stopwords && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :