<?php
/**
 * Functions to build and update the fulltext search index
 *
 * The index consists of three line oriented files in $conf['cachedir']:
 *
 *   page.idx  - one page name per line, the (zero based) line number
 *               is the page ID
 *   word.idx  - one word per line, the (zero based) line number is
 *               the word ID
 *   index.idx - line N lists the pages containing word N as colon
 *               separated "pageID*count" entries
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

    if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
    require_once(DOKU_CONF.'dokuwiki.php');
    require_once(DOKU_INC.'inc/io.php');
    require_once(DOKU_INC.'inc/utf8.php');
    require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Read an index file into an array of lines
 *
 * Line endings are kept, exactly as file() returns them. A file that
 * does not exist yet is treated as an empty index. A file that exists
 * but can not be read is reported as an error, so callers never
 * overwrite an index they failed to load.
 *
 * @param  string $file  full path of the file to read
 * @return array|false   the lines of the file, false on a read error
 */
function _idx_readLines($file){
    if(!@file_exists($file)) return array();
    return file($file);
}

/**
 * Split a page into words
 *
 * It is based upon PHPCMS's indexer function index_entry
 *
 * New words are appended to word.idx, which is written back on every
 * call. Non-numeric words shorter than three bytes and words listed in
 * the stopword file of the configured language are not indexed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page to read
 * @return array|false   word counts indexed by word ID, false if an
 *                       error occurred
 */
function idx_getPageWords($page){
    global $conf;

    $word_idx = _idx_readLines($conf['cachedir'].'/word.idx');
    if($word_idx === false) return false;

    // load the stopwords as array keys: this gives a fast lookup and
    // makes the check independent of the line endings used in the file
    $stopwords = array();
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $lines = file($swfile);
        if($lines !== false){
            foreach($lines as $stopword){
                $stopwords[rtrim($stopword,"\r\n")] = true;
            }
        }
    }

    // split page into words
    $body  = rawWiki($page);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    // sort as plain strings so identical words end up next to each
    // other; the default flags would compare numeric strings by value
    sort($words, SORT_STRING);

    $index = array(); // resulting index
    $old   = null;    // previously seen word
    $wid   = -1;      // word ID of the previously indexed word
    $doit  = true;    // was the previous word accepted?

    // compact the sorted wordlist into word ID => count
    foreach($words as $word){
        if($word === '') continue;

        // it's the same word again (strict comparison: '007' and '7'
        // are different words)
        if($word === $old){
            // just increase the counter - unless we didn't want the
            // word the last time
            if($doit) $index[$wid]++;
            continue;
        }

        // remember the word for the next round
        $old  = $word;
        $doit = true;

        // checking minimum word-size (excepting numbers)
        if(!is_numeric($word) && strlen($word) < 3){
            $doit = false;
            continue;
        }

        // stopword check
        if(isset($stopwords[$word])){
            $doit = false;
            continue;
        }

        // get word ID, registering the word if it is not known yet
        $wid = array_search("$word\n",$word_idx,true);
        if($wid === false){
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }

        // add to index
        $index[$wid] = 1;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search index for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if needed, then index.idx is
 * copied line by line to index.tmp with the entries of this page
 * replaced, and finally index.tmp is renamed over index.idx.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $page  ID of the page to index
 * @return bool          true on success, false on any error
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = _idx_readLines($conf['cachedir'].'/page.idx');
    if($page_idx === false) return false;

    // get page id (this is the linenumber in page.idx)
    $isnew = false;
    $pid   = array_search("$page\n",$page_idx,true);
    if($pid === false){
        $page_idx[] = "$page\n";
        $pid   = count($page_idx)-1;
        $isnew = true;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;

    // A new page without words has nothing to add and nothing to
    // remove. A known page without words still needs its old entries
    // removed from the index, so it must not take this shortcut.
    if($isnew && !count($words)) return true;

    // Open index and temp file (a missing index is an empty index)
    $idxfile  = $conf['cachedir'].'/index.idx';
    $tmpfile  = $conf['cachedir'].'/index.tmp';
    $hasindex = @file_exists($idxfile);
    $idx = $hasindex ? fopen($idxfile,'r') : false;
    $tmp = fopen($tmpfile,'w');
    if(($hasindex && !$idx) || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno = 0;
    if($idx){
        $line = '';
        while(($chunk = fgets($idx, 4096)) !== false){
            // collect chunks until the full line was read
            $line .= $chunk;
            if(substr($line,-1) != "\n") continue;

            // write a new line to temp file
            idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);

            $line = ''; // reset line buffer
            $lno++;     // increase linecounter
        }
        // don't lose a last line that has no trailing newline
        if($line !== ''){
            idx_writeIndexLine($tmp,$line,$pid,isset($words[$lno]) ? $words[$lno] : 0);
            $lno++;
        }
        fclose($idx);
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here)
    $word_idx = _idx_readLines($conf['cachedir'].'/word.idx');
    if($word_idx === false){
        fclose($tmp);
        return false;
    }
    $wcnt = count($word_idx);
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,isset($words[$lno]) ? $words[$lno] : 0);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    return rename($tmpfile,$idxfile);
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and re-adds it when $count is >0.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param resource $fh     handle of the file to write to
 * @param string   $line   the existing index line, '' for a new line
 * @param int      $pid    ID of the page that is being indexed
 * @param int      $count  how often the word occurs in the page
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // copy all entries except the one of the given doc
        foreach($parts as $part){
            if($part == '') continue;
            list($doc,$cnt) = explode('*',$part);
            // $doc is a string read from the file while $pid is a
            // number, so the loose comparison is intended here
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

//Setup VIM: ex: et ts=4 enc=utf-8 :