<?php

namespace dokuwiki\Utf8;

/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and work on raw byte strings; none of them needs
 * the mbstring extension.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte above 0x7F is present
     */
    public static function isASCII($str)
    {
        // a single byte outside 0x00-0x7F is enough to disqualify the string
        return preg_match('/[^\x00-\x7F]/', $str) !== 1;
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte has to be followed by
     * the number of continuation bytes (10bbbbbb) it announces. Lead bytes
     * announcing 5 and 6 byte sequences are accepted, and overlong encodings
     * or surrogate code points are not rejected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // does not match any model
            }

            // $n bytes matching 10bbbbbb have to follow; running off the end
            // of the string means the sequence was truncated
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string
     */
    public static function strip($str)
    {
        // byte-wise on purpose (no /u modifier): every byte >= 0x80 is dropped
        return (string) preg_replace('/[^\x00-\x7F]/', '', $str);
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * In addition to the characters returned by Table::specialChars(), the
     * bytes 0x00 to 0x19 are stripped as well.
     *
     * NOTE(review): the range ends at 0x19 rather than 0x1F, so 0x1A-0x1F are
     * only removed when the special chars table or $additional covers them.
     * Confirm whether that is intended before changing it.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string     The UTF8 string to strip of special chars
     * @param string $repl       Replace special with this string
     * @param string $additional Additional chars to strip (used in regexp char class,
     *                           inserted unescaped)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted table is built once per request and reused afterwards
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is processed in a single regular expression pass. No output
     * buffering is involved, so the method is safe to call from inside an
     * output buffer handler.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str     to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        simply drops them) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        // Every alternative but the last one matches a well-formed sequence.
        // The last alternative is the only one filling capture group 2 and
        // catches the single byte that fits nowhere else.
        $pattern =
            '([\x00-\x7F]' .                          // ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
            '|(.{1}))';                               // invalid byte

        $cleaned = preg_replace_callback(
            '/' . $pattern . '/S',
            function ($matches) use ($replace) {
                // group 2 is only present (and non-empty) for an invalid byte
                if (isset($matches[2]) && $matches[2] !== '') {
                    return $replace;
                }
                return $matches[0];
            },
            $str
        );

        // preg_replace_callback() yields null on a PCRE error; keep the
        // documented string return type in that case
        return (string) $cleaned;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int    $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        // $case = 0 satisfies both conditions, so both tables are applied
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        // pure ASCII input is returned untouched, the table is not consulted
        if (self::isASCII($string)) {
            return $string;
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * Indexes at or below 0 yield 0, indexes at or beyond the end of the
     * string yield the string's byte length.
     *
     * @author chris smith <chris@jalakai.co.uk>
     *
     * @param string $str  utf8 character string
     * @param int    $i    byte index into $str
     * @param bool   $next direction to search for boundary,
     *                     false = up (current character)
     *                     true = down (next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            // skip forward over continuation bytes (10bbbbbb)
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            // walk back to the lead byte of the current character
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }
}