<?php
/**
 * Functions to build and query the fulltext search index
 *
 * The index consists of three line based files in $conf['cachedir']:
 *
 *   page.idx  - one page id per line, the line number is the page's ID
 *   word.idx  - one word per line, the line number is the word's ID
 *   index.idx - one line per word ID, holding "pageID*count" pairs
 *               separated by colons
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_CONF.'dokuwiki.php');
require_once(DOKU_INC.'inc/io.php');
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN','['.
                   '\x{0E00}-\x{0E7F}'.  // Thai
                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');


/**
 * Split a page into words
 *
 * Reads the raw text of the page, counts how often each word occurs and
 * maps every word to its word ID (its line number in word.idx). Words not
 * yet known are appended to word.idx, which is always written back.
 *
 * Non-numeric words shorter than 3 bytes and words listed in the stopword
 * file of the configured language are skipped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  the page id
 * @return array|false   array(wordID => frequency), false if word.idx
 *                       could not be written
 */
function idx_getPageWords($page){
    global $conf;

    // file() yields false when the file can't be read - start empty then
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    // split the page text on whitespace
    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure

            // a token may break up into several words once specials are gone
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            // plain ASCII alphanumerics need no special treatment
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // Strict search: a loose comparison would treat numeric looking
        // words such as "10" and "1e1" as the same entry
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word - its ID is the line it gets appended at
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The whole index is copied line by line to a temporary file, replacing
 * the entry of the page in each line, and then moved over the old index.
 *
 * NOTE(review): a page that yields no indexable words returns early and
 * keeps whatever entries it had in the index before - confirm this is
 * intended.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  the page id
 * @return bool          true when the page had no words to index or the
 *                       new index was moved into place, false on failure
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (lines may be longer than one fgets chunk)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file, words not used on the page count 0
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // a last line without trailing newline is still a line of the index;
    // dropping it would lose all documents recorded for that word
    if($line !== ''){
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    // io_rename() is assumed to report success with a true value; anything
    // else is passed on as failure
    $moved = io_rename($conf['cachedir'].'/index.tmp',
                       $conf['cachedir'].'/index.idx');
    return $moved ? true : false;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and adds it again when $count is > 0.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle of the file to write to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    the page ID to update in the line
 * @param int      $count  how often the word occurs in the page
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            // pad so a malformed entry without '*' raises no notice
            list($doc,$cnt) = array_pad(explode('*',$part),2,'');
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * A word may start and/or end with a '*' to match all indexed words
 * ending with, starting with or containing the given string.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array $words  the words to look up
 * @return array|false   array(word => array(pageid => count)), false if
 *                       the index could not be opened
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;      // bit 1: leading wildcard, bit 2: trailing wildcard
        $ptn   = '';
        $xword = $word;

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            // the lines of word.idx end in a newline, '$' matches before it
            $ptn = '/'.preg_quote($xword,'/').'$/';
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
        }

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            $cnt = count($word_idx);
            for($wid=0; $wid<$cnt; $wid++){
                $iword = $word_idx[$wid];
                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                    (($wild==1) && preg_match($ptn,$iword)) ||
                    (($wild==2) && (0 === strpos($iword,$xword)))
                  ){
                    $wids[] = $wid;
                    $result[$word][] = $wid;
                }
            }
        }else{     // handle exact search
            // strict, so numeric looking words are compared as strings
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                $wids[] = $wid;
                $result[$word][] = $wid;
            }else{
                $result[$word] = array();
            }
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index file",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // array_shift() returns null once no IDs are left. This has to be
    // tested with is_null() because 0 is a valid word ID.
    while (!is_null($srch) && !feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch) break;               // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);

            $srch = array_shift($wids);       // next word to look up
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // merge found pages into final result array
    $final = array();
    foreach($result as $word => $found){
        $final[$word] = array();
        foreach($found as $wid){
            // the index may hold no line for this word ID
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}

/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(pageid => count)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        // pad so a malformed entry without '*' raises no notice
        list($doc,$cnt) = array_pad(explode('*',$part),2,'');
        if(!$cnt) continue;
        // skip page IDs that are not (or no longer) listed in page.idx
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against '' - a page may legitimately be named "0"
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords, each ending in a newline
 * @param boolean  $wc         are wildcards allowed?
 * @return array               list of lowercased search words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();
    // when wildcards are allowed the '*' is not added to the character
    // list handed to utf8_stripspecials()
    $wc = ($wc) ? '' : '\*';

    // an empty or missing stopword list means nothing is filtered
    $filter = is_array($stopwords) && count($stopwords);

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($filter && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($filter && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :