<?php
/**
 * Common DokuWiki functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

    if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
    require_once(DOKU_CONF.'dokuwiki.php');
    require_once(DOKU_INC.'inc/io.php');
    require_once(DOKU_INC.'inc/utf8.php');
    require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN','['.
                   '\x{0E00}-\x{0E7F}'.  // Thai
                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');


/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, counts every word in it and maps
 * each word to its id (the line number of the word in word.idx). Words not
 * yet known are appended to word.idx, which is written back afterwards.
 *
 * @param  string $page  the page id
 * @return array|false   array of (word id => frequency), false if word.idx
 *                       could not be written
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;

    // file() yields false when the word list can't be read - start empty then
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }
    // flipped copy allows a constant time stopword check; the keys keep the
    // trailing newline of the file lines
    $stopmap = is_array($stopwords) ? array_flip($stopwords) : array();

    $body   = rawWiki($page);
    // strtr() ignores surplus characters when both lists differ in length,
    // so each whitespace character needs its own replacement space
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there maybe a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        // NOTE(review): presumably the metadata is missing for some pages - skip those
        if (is_array($links) && count($links)) {
            $tmp = join(' ',array_keys($links));                // make a single string
            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        if (isset($stopmap["$word\n"])) continue;
        $wid = array_search("$word\n",$word_idx);
        if(!is_int($wid)){
            // unknown word: its id is the position it gets at the end of the list
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The page is registered in page.idx if needed, then index.idx is copied
 * line by line into index.tmp with the entries of this page replaced, and
 * finally index.tmp is moved over index.idx.
 *
 * @param  string $page  the page id
 * @return bool          true when the index was updated (or the page has no
 *                       indexable words), false on failure
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying where needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line (long lines arrive in chunks of 4096 bytes)
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new Line to temp file
        idx_writeIndexLine($tmp,$line,$pid,(isset($words[$lno]) ? $words[$lno] : 0));

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // a last line without trailing newline is still pending in the buffer;
    // dropping it would lose all entries of that word
    if($line !== ''){
        idx_writeIndexLine($tmp,$line,$pid,(isset($words[$lno]) ? $words[$lno] : 0));
        $lno++;
    }
    fclose($idx);

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        idx_writeIndexLine($tmp,'',$pid,(isset($words[$lno]) ? $words[$lno] : 0));
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // io_rename() comes from inc/io.php; its return value is not visible
    // from here, so only an explicit false is treated as a failure
    $moved = io_rename($conf['cachedir'].'/index.tmp',
                       $conf['cachedir'].'/index.idx');
    if($moved === false) return false;
    return true;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes a line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and readds it when $count is >0.
 *
 * A line consists of "docid*count" entries separated by colons.
 *
 * @param resource $fh     handle to write to
 * @param string   $line   the existing index line (may be empty)
 * @param int      $pid    id of the document to update
 * @param int      $count  new frequency of the word in this document
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            list($doc,$cnt) = explode('*',$part);
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of word and will return a list of matching
 * documents for each one. A word may start and/or end with an asterisk
 * to match all indexed words ending with, starting with or containing it.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param  array $words  the words to look up
 * @return array|false   array of (word => array(page => count)), false if
 *                       the index could not be opened
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($page_idx)) $page_idx = array();
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;   // 1: leading wildcard, 2: trailing wildcard, 3: both
        $xword = $word;
        $ptn   = '';

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            // only used for a pure suffix search; $ matches before the
            // newline that ends each entry of the word list
            $ptn = '/'.preg_quote($xword,'/').'$/';
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
        }

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            $cnt = count($word_idx);
            for($wid=0; $wid<$cnt; $wid++){
                $iword = $word_idx[$wid];
                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                    (($wild==1) && preg_match($ptn,$iword)) ||
                    (($wild==2) && (0 === strpos($iword,$xword)))
                  ){
                    $wids[] = $wid;
                    $result[$word][] = $wid;
                }
            }
        }else{     // handle exact search
            $wid = array_search("$word\n",$word_idx);
            if(is_int($wid)){
                $wids[] = $wid;
                $result[$word][] = $wid;
            }
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
        msg("Failed to open index file",-1);
        return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // array_shift() returns null once the list is used up. The check has
    // to be strict because 0 is a valid word id.
    while ($srch !== null && !feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch) break;               // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);

            $srch = array_shift($wids);       // next word to look up
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    // the wanted line may be a last line without trailing newline
    if($srch !== null && $line !== '' && $lno == $srch){
        $docs[$srch] = idx_parseIndexLine($page_idx,$line);
    }
    fclose($idx);

    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // word ids beyond the end of the index have no line
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}

/**
 * Returns a list of documents and counts from a index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array of (page => count)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        // pad so that a malformed part without '*' counts as zero hits
        list($doc,$cnt) = array_pad(explode('*',$part),2,0);
        if(!$cnt) continue;
        // ignore ids which are not in the page list
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against the empty string, a page may be named "0"
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * @param  string   $string     the query as given by the user
 * @param  arrayref $stopwords  array of stopwords
 * @param  boolean  $wc         are wildcards allowed?
 * @return array                the lowercased words without stopwords
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();
    // the asterisk is only stripped when wildcards are not allowed
    $wc = ($wc) ? '' : '\*';
    // both branches below skip the stopword check without a usable list
    $filter = is_array($stopwords) && count($stopwords);

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($filter && is_int(array_search("$w\n",$stopwords))) continue;
            $words[] = $w;
        }
    }else{
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($filter && is_int(array_search("$w\n",$stopwords))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :