<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Default minimum token length to use in the index
// (note, this doesn't apply to numeric tokens).
// Tokenizer::getMinWordLength() prefers the constant IDX_MINWORDLENGTH when it is defined.
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lower-cased word tokens for the fulltext index,
 * dropping stop words and non-numeric tokens shorter than the minimum length.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores */
    protected static $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
     * line) and cached for subsequent calls. A missing or unreadable file
     * yields an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
            $lines = file_exists($swFile) ? file($swFile, FILE_IGNORE_NEW_LINES) : false;
            // file() returns false when the file cannot be read; never cache
            // that, because getWords() uses the cached value as a haystack
            static::$Stopwords = ($lines === false) ? array() : $lines;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the constant IDX_MINWORDLENGTH when it is defined, otherwise the
     * namespace default MINWORDLENGTH. The value is resolved once and cached.
     * (note, the minimum length doesn't apply to numeric tokens)
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            static::$MinWordLength = (defined('IDX_MINWORDLENGTH'))
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords($text, $wc = false)
    {
        // with wildcards allowed nothing is appended to the character list
        // handed to stripspecials() below; otherwise an escaped '*' is added
        // (presumably so that '*' is stripped as well -- confirm in Utf8\Clean)
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // default action, only needed when the text contains anything
            // besides ASCII letters, digits and spaces
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        // turn line breaks and tabs into spaces, remove soft hyphens
        $text = strtr($text, array(
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ));
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        $minLength = static::getMinWordLength();

        $wordlist = array();
        foreach (explode(' ', $text) as $word) {
            // the plain strtolower() is sufficient for pure ASCII tokens
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // too short; the length is measured in bytes and numeric tokens
            // are exempt (this also drops the empty strings that explode()
            // produces for consecutive spaces)
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (in_array($word, static::getStopwords(), true)) {
                continue;
            }
            $wordlist[] = $word;
        }
        return $wordlist;
    }
}