<?php

namespace dokuwiki\Utf8;

/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and operate on raw byte strings.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F was found
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte must be followed by the
     * number of continuation bytes (10bbbbbb) it announces. Lead bytes announcing
     * 5 and 6 byte sequences are accepted, and overlong forms or surrogates are
     * not rejected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // n bytes matching 10bbbbbb follow ?
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string the input with every byte >= 0x80 removed
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly brace offset syntax is a parse error in PHP 8
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
     *
     * Note that $additional is inserted into the character class as is; it is
     * not quoted here, so the caller has to pass a valid character class fragment.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string The UTF8 string to strip of special chars
     * @param string $repl Replace special with this string
     * @param string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted list of special chars never changes, build it only once
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are dropped) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $UTF8_BAD =
            '([\x00-\x7F]' .                          // ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
            '|(.{1}))';                               // invalid byte
        $pattern = '/' . $UTF8_BAD . '/S';

        $result = '';
        $offset = 0;
        // Every byte is consumed by one of the alternatives, so each match starts
        // exactly at $offset. Advancing the offset avoids re-copying the remaining
        // string on every iteration.
        while (preg_match($pattern, $str, $matches, 0, $offset)) {
            if (!isset($matches[2])) {
                // a valid sequence matched, keep it untouched
                $result .= $matches[0];
            } else {
                // only the "invalid byte" group (2) matched
                $result .= $replace;
            }
            $offset += strlen($matches[0]);
        }
        return $result;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; //nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the end of the
     * string yields the string length. Otherwise the index is moved off any
     * continuation byte (10bbbbbb) in the requested direction.
     *
     * @author       chris smith <chris@jalakai.co.uk>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary, false = up (current character)
     *                   true = down (next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            // skip forward over continuation bytes
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            // walk back over continuation bytes
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }
}