<?php

namespace dokuwiki\Search;

use dokuwiki\Utf8\Asian;
use dokuwiki\Utf8\Clean;
use dokuwiki\Utf8\PhpString;
use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// set the minimum token length to use in the index
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into the lower-cased words used by the fulltext search
 * and provides the helpers that decide whether a word is long enough to be used.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores */
    protected static array $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected static int $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per line)
     * and cached in a static property. A missing or unreadable file results in
     * an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords(): array
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

            $words = false;
            if (file_exists($swFile)) {
                $words = file($swFile, FILE_IGNORE_NEW_LINES);
            }

            // file() returns false on failure; the typed property only accepts arrays
            static::$Stopwords = ($words === false) ? [] : $words;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * The global IDX_MINWORDLENGTH constant takes precedence when it is defined,
     * otherwise the MINWORDLENGTH constant of this namespace is used. The value
     * is cached after the first call.
     *
     * @return int
     */
    public static function getMinWordLength(): int
    {
        if (!isset(static::$MinWordLength)) {
            // set the minimum token length to use in the index
            // (note, this doesn't apply to numeric tokens)
            static::$MinWordLength = (defined('IDX_MINWORDLENGTH'))
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * The returned words are lower-cased. Non-numeric words shorter than the
     * minimum word length and stop words are dropped.
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords(string $text, bool $wc = false): array
    {
        // when wildcards are not allowed the asterisk is added to the characters passed to stripspecials()
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before()) {
            // the default preparation is only needed when there is something besides plain ASCII
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr($text, [
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ]);
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Clean::stripspecials($text, ' ', '\._\-:' . $wc);
        }

        // both values are the same for every word, so look them up only once
        $minLength = static::getMinWordLength();
        $stopwords = static::getStopwords();

        $words = [];
        foreach (explode(' ', $text) as $word) {
            // the UTF-8 aware lower-casing is only used when the word is not plain ASCII
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? PhpString::strtolower($word)
                : strtolower($word);

            // numeric tokens are exempt from the minimum length
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (in_array($word, $stopwords, true)) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Check if a search term meets the minimum length requirement
     *
     * Strips wildcard characters, then checks the base against the minimum
     * word length. Numeric terms are always accepted.
     *
     * @param string $term the search term, may include * wildcards
     * @return bool true if the term is valid for searching
     */
    public static function isValidSearchTerm(string $term): bool
    {
        // only leading and trailing asterisks are removed
        $base = trim($term, '*');
        if ($base === '') {
            return false;
        }
        if (is_numeric($base)) {
            return true;
        }
        return static::tokenLength($base) >= static::getMinWordLength();
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used
     *
     * Every byte in the range 0xE2-0xEF found in the token adds (byte - 0xE1)
     * on top of the byte length.
     *
     * @param string $token
     * @return int
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     */
    public static function tokenLength(string $token): int
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}