<?php
/**
 * Functions to build and query the fulltext search index
 *
 * Three files in $conf['cachedir'] are used by the functions below:
 *
 *   page.idx  - one page name per line, the line number is the page id
 *   word.idx  - one word per line, the line number is the word id
 *   index.idx - line N lists the pages containing word N as
 *               colon separated "pageid*count" entries
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode ranges for Asian characters.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me.
//
// The constant name has to be quoted: a bare IDX_ASIAN is an undefined
// constant at this point (notice on old PHP, fatal error on PHP 8).
define('IDX_ASIAN','['.
                   '\x{0E00}-\x{0E7F}'.  // Thai
                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');


/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, counts the words in it and maps
 * each word to its id (line number) in word.idx. Words not yet known
 * are appended to word.idx, which is always written back.
 *
 * Non-numeric words shorter than three bytes and words listed in the
 * stopword file of the configured language are ignored.
 *
 * @param  string $page  id of the page to read
 * @return array|false   array of (word id => frequency), false if
 *                       word.idx could not be written
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false when the file is missing or unreadable;
    // treat that like an empty word list
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body   = rawWiki($page);
    // strtr() ignores surplus characters of the longer argument, so the
    // replacement needs one space for each of the three characters
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words
            $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            // presumably utf8_stripspecials() turns special characters
            // (plus the given . _ - :) into spaces - see inc/utf8.php
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                // the token occurred $count times and each occurrence
                // contains the word $c times
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // strict search: loose comparison may treat different
        // numeric looking words (eg. 100 and 1e2) as equal
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word - its id is the new last line of word.idx
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if needed, then index.idx is
 * copied line by line to index.tmp with the entries of this page
 * replaced, and finally index.tmp replaces index.idx.
 *
 * @param  string $page  id of the page to index
 * @return bool          true on success (also when the page contains no
 *                       indexable words), false on any file error
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    // NOTE(review): entries written for an earlier revision of the page
    // stay in the index when the page no longer contains any words
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (lines may be longer than the read buffer)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file, a word not used
        // in the page gets a count of 0
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // a last line without trailing newline never passed the check
    // above - write it now instead of dropping it
    if($line !== ''){
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    if(@rename($conf['cachedir'].'/index.tmp',
               $conf['cachedir'].'/index.idx')){
        return true;
    }elseif(copy($conf['cachedir'].'/index.tmp',
                 $conf['cachedir'].'/index.idx')){
        unlink($conf['cachedir'].'/index.tmp');
        return true;
    }
    return false;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and readds it when $count is >0.
 *
 * @param resource $fh     handle of the file to write to
 * @param string   $line   existing index line, may be empty
 * @param int      $pid    id of the page being indexed
 * @param int      $count  frequency of the word in that page
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            list($doc,$cnt) = explode('*',$part);
            if($doc != $pid){
                // every kept entry is terminated with a colon, empty
                // parts resulting from this are skipped when reading
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param  array $words  the words to look up
 * @return array|false   array of (word => array(page => count)), the
 *                       inner array is empty for words without matches;
 *                       false if the index could not be opened
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $wid = array_search("$word\n",$word_idx,true);
        if(is_int($wid)){
            $wids[] = $wid;
            // remember the id, it is replaced by the docs below
            $result[$word] = $wid;
        }else{
            $result[$word] = array();
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index files",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // $srch is null when no words are left. It needs a strict check
    // because the word id 0 is valid but equals null when compared loosely
    while ($srch !== null && !feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch) break;   // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);

            $srch = array_shift($wids);        // next word to look up
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // the last line of the index may lack the trailing newline
    if($srch !== null && $line !== '' && $lno == $srch){
        $docs[$srch] = idx_parseIndexLine($page_idx,$line);
    }

    // merge found pages into result array
    foreach(array_keys($result) as $word){
        if(is_int($result[$word])){
            $wid = $result[$word];
            // the index may have less lines than word.idx
            $result[$word] = isset($docs[$wid]) ? $docs[$wid] : array();
        }
    }

    return $result;
}

/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array of (page name => count)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        list($doc,$cnt) = explode('*',$part);
        if(!$cnt) continue;
        // skip ids that are not listed in page.idx
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against the empty string, a page may be named "0"
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * @param  string $string     the string to tokenize
 * @param  array  $stopwords  stopwords as returned by file(), ie. each
 *                            entry ends with a newline; may be empty
 * @return array              list of lowercased words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords){
    $words = array();

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words
        $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);

        $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($stopwords && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        // same guard as above: don't search a missing stopword list
        if($stopwords && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :