<?php
/**
 * Functions to build and query the DokuWiki fulltext search index
 *
 * The index consists of several files in $conf['indexdir']:
 *   page.idx   one page id per line; the line number is the page number
 *   w<len>.idx one word per line, grouped by (faked) word length
 *   i<len>.idx one line per word of w<len>.idx, holding "page*count"
 *              pairs separated by ':'
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

    if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
    require_once(DOKU_CONF.'dokuwiki.php');
    require_once(DOKU_INC.'inc/io.php');
    require_once(DOKU_INC.'inc/utf8.php');
    require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FB}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');


/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length of $w; when $w contains a character from
 * IDX_ASIAN2 the value of the first byte minus 0xE1 is added on top.
 *
 * @param  string $w the word to measure
 * @return int    the (possibly faked) length used to pick the index file
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match('/'.IDX_ASIAN2.'/u',$w))
        $l += ord($w) - 0xE1;  // Lead bytes from 0xE2-0xEF
    return $l;
}

/**
 * Write a list of strings to an index file.
 *
 * The data is written to "<pre><wlen>.tmp" first and then renamed to
 * "<pre><wlen>.idx" so a failed write never replaces the existing index.
 *
 * @param  string     $pre  file name prefix ('w', 'i' or 'page')
 * @param  int|string $wlen word length part of the file name ('' for none)
 * @param  array      $idx  the lines to write, each including its line ending
 * @return bool       false if the temporary file could not be opened or written
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_saveIndex($pre, $wlen, $idx){
    global $conf;
    $fn = $conf['indexdir'].'/'.$pre.$wlen;
    $fh = @fopen($fn.'.tmp','w');
    if(!$fh) return false;
    $ok = (fwrite($fh,join('',$idx)) !== false);
    fclose($fh);
    if(!$ok){
        // do not replace a good index with a truncated one
        @unlink($fn.'.tmp');
        return false;
    }
    if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
    io_rename($fn.'.tmp', $fn.'.idx');
    return true;
}

/**
 * Read the list of words in an index (if it exists).
 *
 * @param  string     $pre  file name prefix ('w', 'i' or 'page')
 * @param  int|string $wlen word length part of the file name ('' for none)
 * @return array      the lines of the file including line endings; an empty
 *                    array if the file is missing or can not be read
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($pre, $wlen){
    global $conf;
    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    if(!@file_exists($fn)) return array();
    $lines = file($fn);
    // file() returns false on failure; callers expect an array
    if(!is_array($lines)) return array();
    return $lines;
}

/**
 * Create an empty index file if it doesn't exist yet.
 *
 * @param  string     $pre  file name prefix ('w', 'i' or 'page')
 * @param  int|string $wlen word length part of the file name ('' for none)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_touchIndex($pre, $wlen){
    global $conf;
    $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
    if(!@file_exists($fn)){
        touch($fn);
        if($conf['fperm']) chmod($fn, $conf['fperm']);
    }
}

/**
 * Split a page into words
 *
 * Returns an array of word counts, false if an error occurred.
 * Array is keyed on the word length, then the word index.
 *
 * Words that are not yet part of the matching w<len>.idx file are appended
 * to it, so calling this function modifies the word index files.
 *
 * @param  string $page the page id
 * @return array|false  array(wordlen => array(word id => frequency))
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }
    // file() returns false when the file can not be read
    if(!is_array($stopwords)) $stopwords = array();

    $body   = rawWiki($page);
    // An explicit map is used on purpose: strtr($s, "\r\n\t", ' ') would only
    // translate "\r" because the extra characters of the longer argument are
    // ignored.
    $body   = strtr($body, array("\r" => ' ', "\n" => ' ', "\t" => ' '));
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there maybe a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        if (!empty($links)) {
            $tmp = join(' ',array_keys($links));                // make a single string
            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // numeric tokens become integer array keys, hand a string to the tokenizer
        $word = (string) $word;
        $arr = idx_tokenizer($word,$stopwords);
        $arr = array_count_values($arr);
        foreach ($arr as $w => $c) {
            $l = wordlen($w);
            if(isset($words[$l])){
                $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            }else{
                $words[$l] = array($w => $c * $count);
            }
        }
    }

    // arrive here with $words = array(wordlen => array(word => frequency))

    $index = array(); //resulting index
    foreach (array_keys($words) as $wlen){
        $word_idx = idx_getIndex('w',$wlen);
        foreach ($words[$wlen] as $word => $freq) {
            $wid = array_search("$word\n",$word_idx);
            if(!is_int($wid)){
                // unknown word: it gets the next free line number
                $wid = count($word_idx);
                $word_idx[] = "$word\n";
            }
            if(!isset($index[$wlen]))
                $index[$wlen] = array();
            $index[$wlen][$wid] = $freq;
        }

        // save back word index
        if(!idx_saveIndex('w',$wlen,$word_idx)){
            trigger_error("Failed to write word index", E_USER_ERROR);
            return false;
        }
    }

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * @param  string $page the page id
 * @return bool   false if one of the index files could not be written
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = idx_getIndex('page','');

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        if (!idx_saveIndex('page','',$page_idx)){
            trigger_error("Failed to write page index", E_USER_ERROR);
            return false;
        }
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    foreach(array_keys($words) as $wlen){
        $index = idx_getIndex('i',$wlen);
        foreach($words[$wlen] as $wid => $freq){
            if($wid<count($index)){
                $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq);
            }else{
                // New words **should** have been added in increasing order
                // starting with the first unassigned index. Should the i index
                // be shorter than expected, the gap is filled with empty
                // lines so every word id still maps to its own line number.
                for($i=count($index); $i<$wid; $i++){
                    $index[$i] = "\n";
                }
                $index[$wid] = idx_updateIndexLine('',$pid,$freq);
            }
        }
        // save back word index
        if(!idx_saveIndex('i',$wlen,$index)){
            trigger_error("Failed to write index", E_USER_ERROR);
            return false;
        }
    }

    return true;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes an line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and readds it when $count is >0.
 *
 * @param  resource $fh    an open, writable file handle
 * @param  string   $line  the current index line
 * @param  int      $pid   the page number
 * @param  int      $count how often the word occurs in the page
 *
 * @deprecated - see idx_updateIndexLine
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    fwrite($fh,idx_updateIndexLine($line,$pid,$count));
}

/**
 * Modify an index line with new information
 *
 * This returns a line of the index. It removes the
 * given document from the line and readds it if
 * $count is >0.
 *
 * @param  string $line  the current index line ("page*count:page*count...")
 * @param  int    $pid   the page number
 * @param  int    $count how often the word occurs in the page
 * @return string the updated line, terminated by a newline
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_updateIndexLine($line,$pid,$count){
    $line = trim($line);
    $updated = array();
    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            // only the page number is needed to decide whether to keep the entry
            list($doc) = explode('*',$part);
            if($doc != $pid){
                $updated[] = $part;
            }
        }
    }

    // add doc
    if ($count){
        $updated[] = "$pid*$count";
    }

    return join(':',$updated)."\n";
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @param  array|int $filter when an array, only lengths that are a key of
 *                           the array are returned; otherwise the length
 *                           itself (if its index exists) followed by all
 *                           greater lengths
 * @return array     list of word lengths
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_indexLengths(&$filter){
    global $conf;
    $dir = @opendir($conf['indexdir']);
    if($dir===false)
        return array();
    $idx = array();
    if(is_array($filter)){
        while (($f = readdir($dir)) !== false) {
            if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
                $i = substr($f,1,-4);
                if (is_numeric($i) && isset($filter[(int)$i]))
                    $idx[] = (int)$i;
            }
        }
    }else{
        // Exact match first.
        if(@file_exists($conf['indexdir']."/i$filter.idx"))
            $idx[] = $filter;
        while (($f = readdir($dir)) !== false) {
            if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
                $i = substr($f,1,-4);
                if (is_numeric($i) && $i > $filter)
                    $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    return $idx;
}

/**
 * Find the index number of each search term.
 *
 * There are two variations: Simple and Sorted.
 * The simple version just takes the words one at a time.
 * The sorted version will group together words that appear in the same index.
 * So it should perform better, because it only opens each index once.
 * Actually, it's not that great. (in my experience) Probably because of the disk cache.
 * And the sorted function does more work, making it slightly slower in some cases.
 *
 * For now, you can choose to use the sorted version by setting $conf['test_indexer'] = 1
 * Eventually, the more worthy will be chosen and the loser cast into the deepest depths.
 *
 * @param array    $words   The query terms. Words should only contain valid characters,
 *                          with a '*' at either the beginning or end of the word (or both)
 * @param arrayref $result  Set to word => array("length*id" ...), use this to merge the
 *                          index locations with the appropriate query term.
 * @return array            Set to length => array(id ...)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndexWordsSimple($words, &$result){
    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild = 0;
        $xword = $word;
        $wlen = wordlen($word);

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild |= 1;
            $wlen -= 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild |= 2;
            $wlen -= 1;
        }
        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            $ptn = preg_quote($xword,'/');
            if(($wild&1) == 0) $ptn = '^'.$ptn;
            if(($wild&2) == 0) $ptn = $ptn.'$';
            $ptn = '/'.$ptn.'/';
            foreach (idx_indexLengths($wlen) as $ixlen){
                $word_idx = idx_getIndex('w',$ixlen);
                foreach(array_keys(preg_grep($ptn,$word_idx)) as $wid){
                    $wids[$ixlen][] = $wid;
                    $result[$word][] = "$ixlen*$wid";
                }
            }
        }else{      // handle exact search
            $word_idx = idx_getIndex('w',$wlen);
            $wid = array_search("$word\n",$word_idx);
            if(is_int($wid)){
                $wids[$wlen][] = $wid;
                $result[$word][] = "$wlen*$wid";
            }else{
                $result[$word] = array();
            }
        }
    }
    return $wids;
}

/**
 * Sorted variant of idx_getIndexWordsSimple()
 *
 * Takes the same parameters and returns the same structures, but groups
 * the query terms by length first so each word index is read only once.
 *
 * @param array    $words   The query terms, see idx_getIndexWordsSimple()
 * @param arrayref $result  Set to word => array("length*id" ...)
 * @return array            Set to length => array(id ...)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndexWordsSorted($words,&$result){
    // parse and sort tokens
    $tokens = array();
    $tokenlength = array();
    $tokenwild = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild = 0;
        $xword = $word;
        $wlen = wordlen($word);

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild |= 1;
            $wlen -= 1;
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild |= 2;
            $wlen -= 1;
        }
        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
        if(!isset($tokens[$xword])){
            $tokenlength[$wlen][] = $xword;
        }
        if($wild){
            $ptn = preg_quote($xword,'/');
            if(($wild&1) == 0) $ptn = '^'.$ptn;
            if(($wild&2) == 0) $ptn = $ptn.'$';
            $tokens[$xword][] = array($word, '/'.$ptn.'/');
            if(!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen;
        }else{
            $tokens[$xword][] = array($word, null);
        }
    }
    asort($tokenwild);
    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
    // $tokenlength = array( base word length => base word ... )
    // $tokenwild = array( base word => base word length ... )

    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
    $indexes_known = idx_indexLengths($length_filter);
    if(!empty($tokenwild)) sort($indexes_known);
    // get word IDs
    $wids = array();
    foreach($indexes_known as $ixlen){
        $word_idx = idx_getIndex('w',$ixlen);
        // handle exact search
        if(isset($tokenlength[$ixlen])){
            foreach($tokenlength[$ixlen] as $xword){
                $wid = array_search("$xword\n",$word_idx);
                if(is_int($wid)){
                    $wids[$ixlen][] = $wid;
                    foreach($tokens[$xword] as $w)
                        $result[$w[0]][] = "$ixlen*$wid";
                }
            }
        }
        // handle wildcard search
        foreach($tokenwild as $xword => $wlen){
            // $tokenwild is sorted by length, longer base words can't match either
            if($wlen >= $ixlen) break;
            foreach($tokens[$xword] as $w){
                if(is_null($w[1])) continue;
                foreach(array_keys(preg_grep($w[1],$word_idx)) as $wid){
                    $wids[$ixlen][] = $wid;
                    $result[$w[0]][] = "$ixlen*$wid";
                }
            }
        }
    }
    return $wids;
}

/**
 * Lookup words in index
 *
 * Takes an array of word and will return a list of matching
 * documents for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param  array $words the query terms
 * @return array word => array(page id => summed count); an empty array
 *               when none of the words is known to the index
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    if(isset($conf['test_indexer']) && ($conf['test_indexer']&1))
        $wids = idx_getIndexWordsSorted($words, $result);
    else
        $wids = idx_getIndexWordsSimple($words, $result);
    if(empty($wids)) return array();

    // load known words and documents
    $page_idx = idx_getIndex('page','');

    $docs  = array();                          // hold docs found
    foreach(array_keys($wids) as $wlen){
        $wids[$wlen] = array_unique($wids[$wlen]);
        $index = idx_getIndex('i',$wlen);
        foreach($wids[$wlen] as $ixid){
            if($ixid < count($index))
                $docs["$wlen*$ixid"] = idx_parseIndexLine($page_idx,$index[$ixid]);
        }
    }

    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // a word id without a line in the i index has no documents
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                if(!isset($final[$word][$hitkey])) $final[$word][$hitkey] = 0;
                $final[$word][$hitkey] += $hitcnt;
            }
        }
    }
    return $final;
}

/**
 * Returns a list of documents and counts from a index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array  page id => count
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        $pair = explode('*',$part);
        // skip malformed entries and entries with a count of 0
        if(count($pair) < 2 || !$pair[1]) continue;
        list($doc,$cnt) = $pair;
        // skip page numbers the page index does not know
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        if(!$doc) continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * Words shorter than three bytes are dropped unless they are numeric,
 * the remaining words are lowercased and stopwords are removed.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords
 * @param boolean  $wc         are wildcards allowed?
 * @return array   list of words
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();
    // when wildcards are not allowed, '*' is added to the character list
    // handed to utf8_stripspecials()
    $wc = ($wc) ? '' : '\*';

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')/u',' \1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($stopwords && is_int(array_search("$w\n",$stopwords))) continue;
            $words[] = $w;
        }
    }else{
        // plain ASCII letters and digits only: the string is a single word
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($stopwords && is_int(array_search("$w\n",$stopwords))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :