<?php
/**
 * Common DokuWiki functions
 *
 * This file holds the fulltext indexer helpers. They maintain three line
 * based files below $conf['cachedir']:
 *
 *   page.idx  - one page id per line, the line number is the page number
 *   word.idx  - one word per line, the line number is the word number
 *   index.idx - one line per word number, holding "page*count" entries
 *               separated by colons
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_CONF.'dokuwiki.php');
require_once(DOKU_INC.'inc/io.php');
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, counts how often each word occurs
 * and translates the words into word numbers (line numbers of word.idx).
 * Words not yet listed in word.idx are appended and the file is written
 * back. Stopwords of the configured language are left out.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  id of the page to analyze
 * @return array|false   array(word number => frequency), false if word.idx
 *                       could not be written
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false when the file is missing or unreadable (e.g. on
    // the very first indexer run) - continue with an empty list then
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body = rawWiki($page);
    // strtr() ignores the surplus characters of the longer argument, so the
    // replacement has to provide one blank for each of the three characters
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {
        $word = utf8_strtolower($word);

        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/\W/', $word)) {
            // the token may fall apart into several words
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                // skip short words, but keep short numbers
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            // the filter has to look at the current token here, not at $w
            // which belongs to the inner loop of the branch above
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // lines read by file() keep their newline, so search with it
        if (is_int(array_search("$word\n",$stopwords))) continue;
        $wid = array_search("$word\n",$word_idx);
        if(!is_int($wid)){
            // unknown word - append it, its number is the new last line
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if needed, then index.idx is copied
 * line by line to index.tmp while the entries of this page are replaced.
 * Finally index.tmp is renamed to index.idx.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  id of the page to index
 * @return bool          false on failure, true if the page contains no
 *                       indexable words, else the result of the final rename
 */
function idx_addPage($page){
    global $conf;

    // load known documents, start with an empty list if there are none yet
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    // NOTE(review): entries written for this page by an earlier run are not
    // removed when the page no longer contains any words - confirm intended
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line, long lines arrive in several chunks
        $chunk = fgets($idx, 4096);
        if($chunk === false) break; // end of file or read error
        $line .= $chunk;
        if(substr($line,-1) != "\n") continue;

        // write a new Line to temp file, words not used on the page get
        // a count of 0 which only removes the page from the line
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // a last line without trailing newline is still in the buffer, write
    // it as well instead of dropping its entries
    if($line != ''){
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($conf['cachedir'].'/index.tmp',
                  $conf['cachedir'].'/index.idx');
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and readds it when $count is >0.
 *
 * A line consists of "document*count" entries separated by colons. All
 * entries of other documents are copied unchanged.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle to write to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    number of the document to remove/readd
 * @param int      $count  how often the word occurs in the document
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue; // empty part after a trailing colon
            list($doc,$cnt) = explode('*',$part);
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

//Setup VIM: ex: et ts=4 enc=utf-8 :