<?php
/**
 * DokuWiki fulltext indexer functions
 *
 * The functions in this file maintain three line based files in
 * $conf['cachedir']:
 *
 *   page.idx  - one page name per line, the line number is the page ID
 *   word.idx  - one word per line, the line number is the word ID
 *   index.idx - one line per word ID holding "pageID*count" entries
 *               separated by colons
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  // NOTE(review): DOKU_CONF is not defined in this file - presumably the
  // including script sets it up before this file is loaded; confirm.
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Read a line based index file into an array
 *
 * Internal helper. Every returned element is terminated by a newline,
 * even when the last line of the file was not, so new entries can safely
 * be appended and the lines joined again without merging two entries.
 *
 * @param  string $file full path of the file to read
 * @return array|false  the lines (empty array when the file does not
 *                      exist), false if an existing file can't be read
 */
function _idx_readLines($file){
    // a file that was never written is simply an empty index
    if(!file_exists($file)) return array();

    $lines = file($file);
    if(!is_array($lines)) return false;

    // only the very last line may lack its terminator
    $last = count($lines)-1;
    if($last >= 0 && substr($lines[$last],-1) != "\n"){
        $lines[$last] .= "\n";
    }
    return $lines;
}

/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * Words that are not yet listed in word.idx are appended to it. Words
 * shorter than three bytes are ignored unless they are numeric.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page the page to read the raw wiki text from
 * @return array|false  array of word counts (word ID => count),
 *                      false if an error occurred
 */
function idx_getPageWords($page){
    global $conf;
    $file = $conf['cachedir'].'/word.idx';

    $word_idx = _idx_readLines($file);
    if($word_idx === false) return false;

    // Lookup table "word\n" => word ID. Keys are compared as exact
    // strings, so numeric looking words like "007" and "7" or "1e3" and
    // "1000" are never mixed up. The first occurrence wins, just like
    // array_search() would report it.
    $known = array();
    foreach($word_idx as $id => $entry){
        if(!isset($known[$entry])) $known[$entry] = $id;
    }

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    // Sorting is not needed for counting, it only makes the order in
    // which new words are appended to word.idx deterministic. SORT_STRING
    // avoids the numeric comparison of number-like words.
    sort($words, SORT_STRING);

    $index = array(); //resulting index
    $added = false;   //did we learn new words?

    //compact wordlist FIXME check for stopwords
    foreach(array_count_values($words) as $word => $count){
        // array_count_values() turns keys like "123" into integers
        $word = (string) $word;
        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        // FIXME add config option for the minimum word size
        if(!is_numeric($word) && strlen($word) < 3) continue;

        //FIXME add stopword check

        // get word ID, registering the word if it is new
        $entry = "$word\n";
        if(isset($known[$entry])){
            $wid = $known[$entry];
        }else{
            $word_idx[]    = $entry;
            $wid           = count($word_idx)-1;
            $known[$entry] = $wid;
            $added         = true;
        }

        // add to index
        $index[$wid] = $count;
    }

    // save back word index - only needed when it has changed
    if($added){
        $fh = fopen($file,'w');
        if(!$fh){
            trigger_error("Failed to write word.idx", E_USER_ERROR);
            return false;
        }
        fwrite($fh,join('',$word_idx));
        fclose($fh);
    }

    return $index;
}

/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The whole index is copied to index.tmp while the entries of the
 * given page are replaced, then index.tmp is renamed to index.idx.
 * This is also done when the page contains no indexable words, so
 * entries of a page that has been emptied don't stay in the index.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page the page to index
 * @return bool         true on success, false on errors
 */
function idx_addPage($page){
    global $conf;
    $page_file = $conf['cachedir'].'/page.idx';
    $idx_file  = $conf['cachedir'].'/index.idx';
    $tmp_file  = $conf['cachedir'].'/index.tmp';

    // load known documents
    $page_idx = _idx_readLines($page_file);
    if($page_idx === false) return false;

    // get page id (this is the linenumber in page.idx)
    // strict search: page names must match exactly, not just numerically
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($page_file,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // Open index and temp file. A missing index is treated like an
    // empty one, otherwise the very first page could never be indexed.
    $idx = file_exists($idx_file) ? fopen($idx_file,'r') : null;
    $tmp = fopen($tmp_file,'w');
    if($idx === false || !$tmp){
        // don't leave the handle that could be opened dangling
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    if($idx){
        $line = '';
        while(!feof($idx)){
            // read full line (long lines arrive in several chunks)
            $chunk = fgets($idx, 4096);
            if($chunk === false) break;
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;

            // write a new line to temp file
            idx_writeIndexLine($tmp,$line,$pid,
                               isset($words[$lno]) ? $words[$lno] : 0);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }

        // a last line without trailing newline must not get lost
        if($line != ''){
            idx_writeIndexLine($tmp,$line,$pid,
                               isset($words[$lno]) ? $words[$lno] : 0);
            $lno++;
        }
        fclose($idx);
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = _idx_readLines($conf['cachedir'].'/word.idx');
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,
                           isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($tmp_file,$idx_file);
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * A line consists of "pageID*count" entries separated by colons.
 * Entries that don't contain a '*' are malformed and are dropped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh    handle to write the line to
 * @param string   $line  the existing index line, may be empty
 * @param int      $pid   ID of the page to update
 * @param int      $count how often the word occurs in the page
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);
    $out  = '';

    if($line != ''){
        // keep the entries of all other docs
        foreach(explode(':',$line) as $part){
            if($part == '') continue;

            $pair = explode('*',$part);
            if(count($pair) < 2) continue; // malformed entry

            list($doc,$cnt) = $pair;
            if($doc !== (string) $pid){
                $out .= "$doc*$cnt:";
            }
        }
    }

    // add doc
    if($count){
        $out .= "$pid*$count";
    }

    // write the line including its newline
    fwrite($fh,$out."\n");
}

//Setup VIM: ex: et ts=4 enc=utf-8 :