<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Default minimum token length to use in the index; used when the global
// IDX_MINWORDLENGTH constant is not defined.
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lower-cased words for the fulltext index and search,
 * dropping stop words and non-numeric words shorter than the minimum length.
 * The stop word list and the minimum length are resolved lazily and cached in
 * static properties.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores */
    protected static $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
     * line) below DOKU_INC, using the language in $conf['lang'], and then
     * cached. A missing or unreadable file results in an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
            $words = file_exists($swFile) ? file($swFile, FILE_IGNORE_NEW_LINES) : false;
            // file() returns false when the file cannot be read; never cache
            // that, callers rely on getting an array back
            static::$Stopwords = ($words === false) ? [] : $words;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the global IDX_MINWORDLENGTH constant when it is defined and falls
     * back to this namespace's MINWORDLENGTH otherwise. The value is cached
     * after the first call.
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            // set the minimum token length to use in the index
            // (note, this doesn't apply to numeric tokens)
            static::$MinWordLength = defined('IDX_MINWORDLENGTH')
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The text is split on spaces (line breaks and tabs count as spaces, soft
     * hyphens are removed), every word is lower-cased, and words that are
     * either stop words or non-numeric and shorter than the minimum word
     * length (measured in bytes) are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords($text, $wc = false)
    {
        // Without wildcard support '\*' is appended to the extra character
        // list handed to stripspecials() below.
        // NOTE(review): presumably that list names additional characters to
        // strip - confirm against Utf8\Clean::stripspecials()
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // pure ASCII alphanumeric text needs no Asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr($text, [
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ]);
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:' . $wc);
        }

        // resolve the filter settings once instead of once per word; the
        // flipped list gives a constant-time stop word lookup
        $minLength = static::getMinWordLength();
        $stopwords = array_flip(static::getStopwords());

        $words = [];
        foreach (explode(' ', $text) as $word) {
            // only words containing non-ASCII-alphanumerics need the UTF-8 aware variant
            $word = preg_match('/[^0-9A-Za-z]/u', $word)
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // numeric tokens are exempt from the minimum length
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Check if a search term meets the minimum length requirement
     *
     * Strips leading and trailing wildcard characters, then checks the base
     * against the minimum word length using tokenLength(). Numeric terms are
     * always accepted, terms consisting only of wildcards never are.
     *
     * @param string $term the search term, may include * wildcards
     * @return bool true if the term is valid for searching
     */
    public static function isValidSearchTerm(string $term): bool
    {
        $base = trim($term, '*');
        if ($base === '') {
            return false;
        }
        if (is_numeric($base)) {
            return true;
        }
        return static::tokenLength($base) >= static::getMinWordLength();
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used.
     * Every byte in the range 0xE2-0xEF found in the token adds its distance
     * from 0xE1 on top of the byte length.
     *
     * @param string $token
     * @return int
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function tokenLength($token)
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}