<?php
/**
 * Common DokuWiki functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * The page text is stripped of special characters, lowercased and split
 * on spaces. Every word that is kept is looked up in word.idx; unknown
 * words are appended to it, and the (possibly extended) word list is
 * written back before returning.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  id of the page to read
 * @return array|false   word ID (zero based line number in word.idx) =>
 *                       number of occurrences in the page, false if
 *                       word.idx could not be read or written
 */
function idx_getPageWords($page){
    global $conf;
    $word_file = $conf['cachedir'].'/word.idx';

    // a word list that does not exist yet is simply an empty one
    $word_idx = file_exists($word_file) ? file($word_file) : array();
    // an existing but unreadable list must not be replaced by a fresh one,
    // that would invalidate every word ID already used in the index
    if($word_idx === false) return false;

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    // Sort as plain strings so new words get their IDs in a stable order,
    // then count each distinct token. Tokens are told apart byte by byte:
    // "007" and "7" are different words.
    sort($words, SORT_STRING);
    $counts = array_count_values($words);

    $index = array(); //resulting index

    //compact wordlist FIXME check for stopwords
    foreach($counts as $word => $count){
        // PHP turns decimal integer keys like "42" into ints - undo that
        $word = (string) $word;

        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        // FIXME add config option for min wordsize
        if(!is_numeric($word) && strlen($word) < 3) continue;

        //FIXME add stopword check

        // get word ID (file() keeps the line endings, hence the "\n")
        $wid = array_search("$word\n",$word_idx,true);
        if($wid === false){
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }

        // add to index
        $index[$wid] = $count;
    }

    // save back word index
    $fh = fopen($word_file,'w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is not known yet. Then every
 * line of index.idx is copied to index.tmp with the entry for this page
 * replaced by its current word count, and index.tmp is finally renamed
 * over index.idx. A page without any indexable words still triggers the
 * rewrite, so that entries from an earlier revision are removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $page  id of the page to index
 * @return bool          true on success, false if an index file could
 *                       not be read, written or renamed
 */
function idx_addPage($page){
    global $conf;
    $page_file  = $conf['cachedir'].'/page.idx';
    $index_file = $conf['cachedir'].'/index.idx';
    $temp_file  = $conf['cachedir'].'/index.tmp';

    // load known documents, a missing list is an empty one
    $page_idx = file_exists($page_file) ? file($page_file) : array();
    if($page_idx === false) return false;

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if($pid === false){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($page_file,'w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // Open index and temp file. An index that does not exist yet is
    // treated as empty (null); false means it exists but could not be opened.
    $idx = file_exists($index_file) ? fopen($index_file,'r') : null;
    $tmp = fopen($temp_file,'w');
    if($idx === false || !$tmp){
        // do not leak the handle that did open
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    if($idx){
        // fgets() without a length returns the whole line, including a
        // final line that has no trailing newline
        while(($line = fgets($idx)) !== false){
            // most words do not occur in this page, those get a count of 0
            $count = isset($words[$lno]) ? $words[$lno] : 0;
            idx_writeIndexLine($tmp,$line,$pid,$count);
            $lno++;
        }
        fclose($idx);
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        $count = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,'',$pid,$count);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($temp_file,$index_file);
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * A line consists of "doc*count" entries separated by colons.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  resource   $fh     handle to write the line to
 * @param  string     $line   existing index line, may be empty
 * @param  int|string $pid    id of the document to update
 * @param  int        $count  occurrences of the word in the document,
 *                            0 removes the document from the line
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);
    $out  = '';

    if($line !== ''){
        // remove doc from given line
        foreach(explode(':',$line) as $part){
            if($part === '') continue;

            $fields = explode('*',$part);
            $doc    = $fields[0];
            // tolerate a damaged entry that lost its counter
            $cnt    = isset($fields[1]) ? $fields[1] : '';

            // entries are written as "$pid*$count", so compare the same
            // string form instead of relying on numeric type juggling
            if((string) $doc !== (string) $pid){
                $out .= "$doc*$cnt:";
            }
        }
    }

    // add doc
    if($count){
        $out .= "$pid*$count";
    }

    // add newline
    fwrite($fh,$out."\n");
}

//Setup VIM: ex: et ts=4 enc=utf-8 :