<?php

namespace dokuwiki\Search;

use dokuwiki\Utf8\Asian;
use dokuwiki\Utf8\Clean;
use dokuwiki\Utf8\PhpString;
use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// set the minimum token length to use in the index
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lower-cased index/search tokens and provides the
 * helpers (stop word list, minimum word length, token length) used for that.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores */
    protected static array $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected static int $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt below DOKU_INC
     * (one word per line) and then cached in a static property. A missing or
     * unreadable file yields an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords(): array
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

            // file() returns false when the file exists but cannot be read;
            // storing false in the typed array property would raise a TypeError
            $lines = file_exists($swFile) ? file($swFile, FILE_IGNORE_NEW_LINES) : false;
            static::$Stopwords = ($lines === false) ? [] : $lines;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined, otherwise the
     * MINWORDLENGTH constant of this namespace. The value is cached in a
     * static property after the first call.
     *
     * @return int
     */
    public static function getMinWordLength(): int
    {
        if (!isset(static::$MinWordLength)) {
            // set the minimum token length to use in the index
            // (note, this doesn't apply to numeric tokens)
            static::$MinWordLength = (defined('IDX_MINWORDLENGTH'))
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The returned words are lower-cased. Non-numeric words shorter (in bytes)
     * than the minimum word length and stop words are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed? When false, '\*' is appended to the
     *                 extra character list handed to Clean::stripspecials()
     * @return array list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords(string $text, bool $wc = false): array
    {
        $wildcard = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before()) {
            // only texts containing something other than ASCII letters, digits
            // and spaces are passed to the Asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        // turn line breaks and tabs into spaces, drop soft hyphens
        $text = strtr($text, [
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ]);
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Clean::stripspecials($text, ' ', '\._\-:' . $wildcard);
        }

        // both values are the same for every word, so look them up only once
        $minLength = static::getMinWordLength();
        $stopwords = static::getStopwords();

        $words = [];
        foreach (explode(' ', $text) as $word) {
            // plain ASCII words can use the cheaper native strtolower()
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? PhpString::strtolower($word)
                : strtolower($word);

            // too short (numeric tokens are exempt from the length limit)
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (in_array($word, $stopwords, true)) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Check if a search term meets the minimum length requirement
     *
     * Strips wildcard characters, then checks the base against the minimum
     * word length. Numeric terms are always accepted.
     *
     * @param string $term the search term, may include * wildcards
     * @return bool true if the term is valid for searching
     */
    public static function isValidSearchTerm(string $term): bool
    {
        // only leading and trailing asterisks are removed
        $base = trim($term, '*');
        if ($base === '') {
            return false;
        }
        if (is_numeric($base)) {
            return true;
        }
        return static::tokenLength($base) >= static::getMinWordLength();
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used
     *
     * @param string $token
     * @return int
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     */
    public static function tokenLength(string $token): int
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked:
        // every byte in the range 0xE2-0xEF adds its distance from 0xE1 on top of the byte length
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}