<?php
/**
 * Common DokuWiki functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_CONF.'dokuwiki.php');
require_once(DOKU_INC.'inc/io.php');
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Split a page into words
 *
 * Reads the raw text of the page via rawWiki(), splits it on whitespace,
 * counts how often each word occurs and maps every word to its word ID
 * (its line number in word.idx). Words that are not yet listed in
 * word.idx are appended and the file is written back.
 *
 * Non-numeric words shorter than three bytes and words listed in the
 * stopword file of the configured language are not indexed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  ID of the page to analyse
 * @return array|false   array(word ID => frequency), false if word.idx
 *                       could not be written
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false when the file cannot be read - start with an
    // empty word list in that case instead of searching in a boolean
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body = rawWiki($page);
    // strtr() ignores the surplus characters of the longer argument, so
    // the replacement needs one space per whitespace character to convert
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    $words = array();
    foreach ($tokens as $word => $count) {

        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // the token contains something besides ASCII letters and digits:
            // let utf8_stripspecials() clean it and split the result again
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            // plain ASCII token: the length check has to look at $word here,
            // $w only exists inside the branch above
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // both lists hold one entry per line including the trailing newline;
        // compare strictly so numeric looking words ("1e3" vs. "1000") are
        // never treated as equal
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word: append it, its ID is the new last line
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if it is not known yet, its words
 * are fetched through idx_getPageWords() and index.idx is rewritten
 * line by line through a temporary file (index.tmp).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page to index
 * @return bool          false on failure; true if the page has no words
 *                       to index; otherwise the result of renaming the
 *                       temporary file over index.idx
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    // NOTE(review): index lines written for an earlier revision of the page
    // are left untouched when the page has no words now - confirm intended
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // do not leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (fgets reads at most 4095 bytes per call)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new line to temp file; words missing in the page count 0
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // a final line without trailing newline is still a valid index line
    if($line != ''){
        idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt     = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($conf['cachedir'].'/index.tmp',
                  $conf['cachedir'].'/index.idx');
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * A line consists of "doc*count" entries separated by colons. Entries
 * that do not follow this format are dropped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle to write to
 * @param string   $line   the existing index line, may be empty
 * @param int      $pid    ID of the document to update
 * @param int      $count  new frequency for the document, 0 removes it
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $out  = '';
    $line = trim($line);

    if($line != ''){
        // keep all entries but the one of the given doc
        foreach(explode(':',$line) as $part){
            if($part == '') continue;
            $pair = explode('*',$part,2);
            if(count($pair) < 2) continue; // malformed entry
            list($doc,$cnt) = $pair;
            if($doc != $pid){
                $out .= "$doc*$cnt:";
            }
        }
    }

    // add doc
    if ($count){
        $out .= "$pid*$count";
    }

    // add newline - every word ID owns exactly one line
    fwrite($fh,$out."\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of words and will return a list of matching
 * documents for each one.
 *
 * The returned array has one entry per input word, numbered by the
 * position of the word in the input. Each entry holds the key 'word' and,
 * when the word is listed in word.idx, the keys 'wordid' and 'pages'
 * (the result of idx_parseIndexLine() for the matching index line).
 * Returns false if something went wrong.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array $words  the words to look up
 * @return array|false
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    $pos  = 0;
    foreach($words as $word){

        //FIXME words should be cleaned here as in getPageWords

        $wid = array_search("$word\n",$word_idx,true);
        if(is_int($wid)){
            $wids[] = $wid;
            $result[$pos]['wordid'] = $wid;
        }
        $result[$pos]['word'] = $word;
        $pos++;
    }
    // the index is read once from top to bottom, so the wanted lines have
    // to be visited in ascending order and only once each
    $wids = array_unique($wids);
    sort($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
        msg("Failed to open index files",-1);
        return false;
    }

    // Walk the index til the lines are found
    $docs = array();            // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids); // which word do we look for? null if none
    while ($srch !== null && !feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch) break; // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);

            // next word to look up, null ends the loop (compared strictly
            // above because word ID 0 is a valid ID)
            $srch = array_shift($wids);
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // merge docs into results
    $count = count($result);
    for($i=0; $i<$count; $i++){
        if(isset($result[$i]['wordid'])){
            $wid = $result[$i]['wordid'];
            // the index may be shorter than the word list
            $result[$i]['pages'] = isset($docs[$wid]) ? $docs[$wid] : array();
        }
    }

    return $result;
}

/**
 * Returns a list of documents and counts from an index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page name => count), empty if nothing matched
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        $pair = explode('*',$part,2);
        if(count($pair) < 2) continue; // malformed entry
        list($doc,$cnt) = $pair;
        if(!$cnt) continue;
        // translate the document number into the page name
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :