<?php
/**
 * Common DokuWiki functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN','['.
                   '\x{0E00}-\x{0E7F}'.  // Thai
                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');


/**
 * Split a page into words
 *
 * Fetches the page text via rawWiki(), splits it on whitespace, counts
 * the resulting words and maps each word to its line number in word.idx.
 * Words not yet listed in word.idx are appended to it, and the file is
 * written back on every call. Words listed in the stopword file of the
 * configured language are left out.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  the page to read (handed to rawWiki())
 * @return array|false   array(word ID => frequency), false if word.idx
 *                       could not be written
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false for a missing or unreadable file; fall back to
    // an empty list so the array functions below always get an array
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body   = rawWiki($page);
    // strtr() ignores the surplus characters when both strings differ in
    // length, so each whitespace character needs its own replacement
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                // the token itself occurred $count times, so every word
                // found inside it has to be weighted accordingly
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // strict comparison: a loose one treats numeric looking words
        // such as "100" and "1e2" as equal
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word - its ID is the new last line of word.idx
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is not listed there yet, then
 * index.idx is copied line by line to index.tmp with the page's entry
 * replaced on every line, and finally index.tmp replaces index.idx.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  the page to index
 * @return bool          true on success (also when the page yields no
 *                       words), false on failure
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new Line to temp file (a count of 0 only removes the page)
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // a last line without trailing newline is still a complete entry
    if($line !== ''){
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    if(@rename($conf['cachedir'].'/index.tmp',
               $conf['cachedir'].'/index.idx')){
        return true;
    }elseif(copy($conf['cachedir'].'/index.tmp',
                 $conf['cachedir'].'/index.idx')){
        unlink($conf['cachedir'].'/index.tmp');
        return true;
    }
    return false;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * A line has the form "doc*count:doc*count:...". Every entry that is
 * kept is written with a trailing colon, the entry of the given
 * document goes last.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle to write to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    ID of the document to (re)add
 * @param int      $count  how often the word occurs in the document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            list($doc,$cnt) = explode('*',$part);
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one. A word may start and/or end with an
 * asterisk to match word endings, word beginnings or substrings.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array $words  the words to look up
 * @return array|false   array(word => array(page => count)), false if
 *                       the index file could not be opened
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;
        $xword = $word;

        // check for wildcards: 1 = leading, 2 = trailing, 3 = both
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
        }

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            $cnt = count($word_idx);
            for($wid=0; $wid<$cnt; $wid++){
                $iword = $word_idx[$wid];   // still carries its newline
                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                    (($wild==1) && ("$xword\n" === substr($iword,(-1*strlen($xword))-1))) ||
                    (($wild==2) && ($xword === substr($iword,0,strlen($xword))))
                  ){
                    $wids[] = $wid;
                    $result[$word][] = $wid;
                }
            }
        }else{     // handle exact search
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                $wids[] = $wid;
                $result[$word][] = $wid;
            }else{
                $result[$word] = array();
            }
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
        msg("Failed to open index file",-1);
        return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // array_shift() yields null once the list is empty. It has to be
    // checked with === because word ID 0 is valid and 0 == null
    if($srch !== null){
        while (!feof($idx)) {
            // read full line
            $line .= fgets($idx, 4096);
            if(substr($line,-1) != "\n") continue;
            if($lno > $srch) break;           // shouldn't happen

            // do we want this line?
            if($lno == $srch){
                // add docs to list
                $docs[$srch] = idx_parseIndexLine($page_idx,$line);

                $srch = array_shift($wids);   // next word to look up
                if($srch === null) break;     // no more words
            }

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }

        // a last line without trailing newline never passes the check
        // inside the loop, so it is handled here
        if($srch !== null && $line !== '' &&
           substr($line,-1) != "\n" && $lno == $srch){
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);
        }
    }
    fclose($idx);


    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // the word may have no line in the index
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}

/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        $pieces = explode('*',$part);
        if(count($pieces) < 2) continue;        // malformed entry
        list($doc,$cnt) = $pieces;
        if(!$cnt) continue;
        if(!isset($page_idx[$doc])) continue;   // unknown document ID
        $doc = trim($page_idx[$doc]);
        // compare against the empty string, "0" is a usable page name
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords
 * @param boolean  $wc         are wildcards allowed?
 * @return array               the lowercased words in order of appearance
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();
    // The asterisk is only added to the character list handed to
    // utf8_stripspecials() when wildcards are not allowed. Nothing may be
    // added otherwise - concatenating the boolean itself would add "1".
    $wc = $wc ? '' : '\*';

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if(is_array($stopwords) && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if(is_array($stopwords) && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :