<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * Decide whether the mbstring extension is used by the helpers below.
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, 0 otherwise. A value defined earlier is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

// let the mb_* functions default to UTF-8
if(UTF8_MBSTRING) mb_internal_encoding('UTF-8');


/**
 * URL-encode a filename so it may carry unicode characters
 *
 * Slashes are kept as they are.
 *
 * With $safe enabled (the default) a string that consists only of
 * letters, digits and the characters / _ - . % is returned unchanged,
 * which makes it harmless to run the function repeatedly on the same
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file  the filename to encode
 * @param  boolean $safe  skip strings that look plain or already encoded
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        // nothing in here needs encoding
        return $file;
    }

    // encode everything, then bring the path separators back
    return str_replace('%2F','/',urlencode($file));
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 */
function utf8_decodeFN($file){
    // plain pass-through to urldecode()
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte has its high bit set
 */
function utf8_isASCII($str){
    // a byte-wise (non 'u') match: any byte >= 0x80 is not 7bit ASCII
    return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string the input with every byte >= 0x80 removed
 */
function utf8_strip($str){
    // byte-wise (non 'u') replace, so broken UTF-8 is handled as well
    return preg_replace('/[\x80-\xFF]+/','',$str);
}

/**
 * Tries to detect if a string is in UTF-8 encoding
 *
 * Only the structure is checked: every lead byte has to be followed by
 * the announced number of continuation bytes. 5 and 6 byte sequences and
 * overlong forms are accepted by this check.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue;                # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1;      # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2;      # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3;      # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4;      # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5;      # 1111110b
        else return false;                      # Does not match any model
        for ($j=0; $j<$n; $j++) {               # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return integer number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param  string
 * @param  integer number of UTF-8 characters offset (from left)
 * @param  integer (optional) length in UTF-8 characters from offset
 * @return mixed string or false if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
        $strlen = strlen(utf8_decode($str));        // see notes
        $offset = $strlen + $offset;
        if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
        $Ox = (int)($offset/65535);
        $Oy = $offset%65535;

        if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
        $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
        $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
        $length_pattern = '(.*)$';                  // the rest of the string
    } else {

        if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
        if ($offset > $strlen) return '';           // another trivial case

        if ($length > 0) {

            $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

            $Lx = (int)($length/65535);
            $Ly = $length%65535;

            // +ve length requires ... a captured group of length characters
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

        } else if ($length < 0) {

            if ($length < ($offset - $strlen)) return '';

            $Lx = (int)((-$length)/65535);
            $Ly = (-$length)%65535;

            // -ve length requires ... capture everything except a group of -length characters
            //                         anchored at the tail-end of the string
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
        }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * Everything before $start is kept, $replacement is appended and the
 * string continues at character $start+$length. Note that $length
 * defaults to 0 here, so without it the replacement is inserted and
 * nothing is removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string
 * @param  string  $replacement
 * @param  integer $start   character offset where the replacement goes
 * @param  integer $length  number of characters to drop at $start
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep  the (literal) delimiter
 * @param  string $str  the string to split
 * @return mixed array of parts, or false (plus E_USER_WARNING) on empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return false;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 */
function utf8_str_replace($s,$r,$str){
    // turn every search string into a literal, UTF-8 aware pattern
    if(is_array($s)){
        foreach ($s as $k => $v) {
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }else{
        $s = '!'.preg_quote($s,'!').'!u';
    }

    // str_replace() inserts the replacement literally, preg_replace() would
    // read $n and \n in it as back references - so neutralise those
    $literal = array('\\' => '\\\\', '$' => '\\$');
    if(is_array($r)){
        foreach ($r as $k => $v) {
            $r[$k] = strtr($v,$literal);
        }
    }else{
        $r = strtr($r,$literal);
    }

    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this is plain ltrim(). With one, every character of
 * the list is stripped from the start of the string (the ".." range
 * syntax of ltrim() is not supported).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist  characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass ('^' would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this is plain rtrim(). With one, every character of
 * the list is stripped from the end of the string (the ".." range
 * syntax of rtrim() is not supported).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist  characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass ('^' would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist  characters to strip from both ends
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // the charlist has to be handed on, otherwise only whitespace is trimmed
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the string is mapped
 * codepoint by codepoint through the global $UTF8_UPPER_TO_LOWER table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset() first: most codepoints have no entry in the table
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]]) && $UTF8_UPPER_TO_LOWER[$uni[$i]]){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the string is mapped
 * codepoint by codepoint through the global $UTF8_LOWER_TO_UPPER table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset() first: most codepoints have no entry in the table
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]]) && $UTF8_LOWER_TO_UPPER[$uni[$i]]){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * The mappings are read from the global $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS tables.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string
 * @param  integer $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * The string is translated through the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * The quoted list of special chars is built once from the global
 * $UTF8_SPECIAL_CHARS2 and cached for later calls.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS;
    global $UTF8_SPECIAL_CHARS2;

    static $specials = null;
    if(is_null($specials)){
        $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
    }

    // $additional goes into the character class as is - the caller has to quote it
    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset  character offset to start searching from
 * @return mixed character position of the first match or false
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        $ar = utf8_explode($needle, $haystack);
        // utf8_explode() gives false for an empty needle - don't count() that
        if ( is_array($ar) && count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }else{
        if ( !is_int($offset) ) {
            trigger_error('Offset must be an integer',E_USER_WARNING);
            return false;
        }

        $haystack = utf8_substr($haystack, $offset);

        // search the remainder and shift the result back by the offset
        if ( false !== ($pos = utf8_strpos($haystack,$needle))){
            return $pos + $offset;
        }
        return false;
    }
}

4562f954959Sandi/** 457ea2eed85Sandi * Encodes UTF-8 characters to HTML entities 458ea2eed85Sandi * 4599f9fb0e5STom N Harris * @author Tom N Harris <tnharris@whoopdedo.org> 460ea2eed85Sandi * @author <vpribish at shopping dot com> 461ea2eed85Sandi * @link http://www.php.net/manual/en/function.utf8-decode.php 462ea2eed85Sandi */ 463ea2eed85Sandifunction utf8_tohtml ($str) { 464ea2eed85Sandi $ret = ''; 4659f9fb0e5STom N Harris foreach (utf8_to_unicode($str) as $cp) { 4669f9fb0e5STom N Harris if ($cp < 0x80) 4679f9fb0e5STom N Harris $ret .= chr($cp); 4689f9fb0e5STom N Harris elseif ($cp < 0x100) 4699f9fb0e5STom N Harris $ret .= "&#$cp;"; 4709f9fb0e5STom N Harris else 4719f9fb0e5STom N Harris $ret .= '&#x'.dechex($cp).';'; 4729f9fb0e5STom N Harris } 4739f9fb0e5STom N Harris return $ret; 4749f9fb0e5STom N Harris} 4759f9fb0e5STom N Harris 4769f9fb0e5STom N Harris/** 4779f9fb0e5STom N Harris * Decodes HTML entities to UTF-8 characters 4789f9fb0e5STom N Harris * 4799f9fb0e5STom N Harris * Convert any &#..; entity to a codepoint, 4809f9fb0e5STom N Harris * The entities flag defaults to only decoding numeric entities. 4819f9fb0e5STom N Harris * Pass HTML_ENTITIES and named entities, including & < etc. 4829f9fb0e5STom N Harris * are handled as well. Avoids the problem that would occur if you 4839f9fb0e5STom N Harris * had to decode "&#38;&amp;#38;" 4849f9fb0e5STom N Harris * 4859f9fb0e5STom N Harris * unhtmlspecialchars(utf8_unhtml($s)) -> "&&" 4869f9fb0e5STom N Harris * utf8_unhtml(unhtmlspecialchars($s)) -> "&&#38;" 4879f9fb0e5STom N Harris * what it should be -> "&&#38;" 4889f9fb0e5STom N Harris * 4899f9fb0e5STom N Harris * @author Tom N Harris <tnharris@whoopdedo.org> 4909f9fb0e5STom N Harris * @param string $str UTF-8 encoded string 4919f9fb0e5STom N Harris * @param boolean $entities Flag controlling decoding of named entities. 4929f9fb0e5STom N Harris * @return UTF-8 encoded string with numeric (and named) entities replaced. 
4939f9fb0e5STom N Harris */ 4949f9fb0e5STom N Harrisfunction utf8_unhtml($str, $entities=null) { 4959f9fb0e5STom N Harris static $decoder = null; 4969f9fb0e5STom N Harris if (is_null($decoder)) 4979f9fb0e5STom N Harris $decoder = new utf8_entity_decoder(); 4989f9fb0e5STom N Harris if (is_null($entities)) 4999f9fb0e5STom N Harris return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m', 5009f9fb0e5STom N Harris 'utf8_decode_numeric', $str); 5019f9fb0e5STom N Harris else 5029f9fb0e5STom N Harris return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m', 5039f9fb0e5STom N Harris array(&$decoder, 'decode'), $str); 5049f9fb0e5STom N Harris} 5059f9fb0e5STom N Harrisfunction utf8_decode_numeric($ent) { 5069f9fb0e5STom N Harris switch ($ent[2]) { 5079f9fb0e5STom N Harris case 'X': 5089f9fb0e5STom N Harris case 'x': 5099f9fb0e5STom N Harris $cp = hexdec($ent[3]); 5109f9fb0e5STom N Harris break; 5119f9fb0e5STom N Harris default: 5129f9fb0e5STom N Harris $cp = intval($ent[3]); 5139f9fb0e5STom N Harris break; 5149f9fb0e5STom N Harris } 5159f9fb0e5STom N Harris return unicode_to_utf8(array($cp)); 5169f9fb0e5STom N Harris} 5179f9fb0e5STom N Harrisclass utf8_entity_decoder { 5189f9fb0e5STom N Harris var $table; 5199f9fb0e5STom N Harris function utf8_entity_decoder() { 5209f9fb0e5STom N Harris $table = get_html_translation_table(HTML_ENTITIES); 5219f9fb0e5STom N Harris $table = array_flip($table); 5229f9fb0e5STom N Harris $this->table = array_map(array(&$this,'makeutf8'), $table); 5239f9fb0e5STom N Harris } 5249f9fb0e5STom N Harris function makeutf8($c) { 5259f9fb0e5STom N Harris return unicode_to_utf8(array(ord($c))); 5269f9fb0e5STom N Harris } 5279f9fb0e5STom N Harris function decode($ent) { 5289f9fb0e5STom N Harris if ($ent[1] == '#') { 5299f9fb0e5STom N Harris return utf8_decode_numeric($ent); 5309f9fb0e5STom N Harris } elseif (array_key_exists($ent[0],$this->table)) { 5319f9fb0e5STom N Harris return $this->table[$ent[0]]; 5329f9fb0e5STom N Harris } else { 
5339f9fb0e5STom N Harris return $ent[0]; 534ea2eed85Sandi } 535ea2eed85Sandi } 536ea2eed85Sandi} 537ea2eed85Sandi 538ea2eed85Sandi/** 5391abfaba4SAndreas Gohr * Takes an UTF-8 string and returns an array of ints representing the 5401abfaba4SAndreas Gohr * Unicode characters. Astral planes are supported ie. the ints in the 5411abfaba4SAndreas Gohr * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 5421abfaba4SAndreas Gohr * are not allowed. 54382257610Sandi * 5441abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input 5451abfaba4SAndreas Gohr * string isn't a valid UTF-8 octet sequence and raises a PHP error at 5461abfaba4SAndreas Gohr * level E_USER_WARNING 5471abfaba4SAndreas Gohr * 5481abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to 5491abfaba4SAndreas Gohr * trigger errors on encountering bad bytes 5501abfaba4SAndreas Gohr * 5511abfaba4SAndreas Gohr * @author <hsivonen@iki.fi> 5521abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com> 5531abfaba4SAndreas Gohr * @param string UTF-8 encoded string 5541abfaba4SAndreas Gohr * @param boolean Check for invalid sequences? 
55544881bd0Shenning.noren * @return mixed array of unicode code points or false if UTF-8 invalid 5561abfaba4SAndreas Gohr * @see unicode_to_utf8 5571abfaba4SAndreas Gohr * @link http://hsivonen.iki.fi/php-utf8/ 5581abfaba4SAndreas Gohr * @link http://sourceforge.net/projects/phputf8/ 55982257610Sandi */ 5601abfaba4SAndreas Gohrfunction utf8_to_unicode($str,$strict=false) { 5611abfaba4SAndreas Gohr $mState = 0; // cached expected number of octets after the current octet 5621abfaba4SAndreas Gohr // until the beginning of the next UTF8 character sequence 5631abfaba4SAndreas Gohr $mUcs4 = 0; // cached Unicode character 5641abfaba4SAndreas Gohr $mBytes = 1; // cached expected number of octets in the current sequence 56582257610Sandi 5661abfaba4SAndreas Gohr $out = array(); 5671abfaba4SAndreas Gohr 5681abfaba4SAndreas Gohr $len = strlen($str); 5691abfaba4SAndreas Gohr 5701abfaba4SAndreas Gohr for($i = 0; $i < $len; $i++) { 5711abfaba4SAndreas Gohr 5721abfaba4SAndreas Gohr $in = ord($str{$i}); 5731abfaba4SAndreas Gohr 5741abfaba4SAndreas Gohr if ( $mState == 0) { 5751abfaba4SAndreas Gohr 5761abfaba4SAndreas Gohr // When mState is zero we expect either a US-ASCII character or a 5771abfaba4SAndreas Gohr // multi-octet sequence. 5781abfaba4SAndreas Gohr if (0 == (0x80 & ($in))) { 5791abfaba4SAndreas Gohr // US-ASCII, pass straight through. 
5801abfaba4SAndreas Gohr $out[] = $in; 5811abfaba4SAndreas Gohr $mBytes = 1; 5821abfaba4SAndreas Gohr 5831abfaba4SAndreas Gohr } else if (0xC0 == (0xE0 & ($in))) { 5841abfaba4SAndreas Gohr // First octet of 2 octet sequence 5851abfaba4SAndreas Gohr $mUcs4 = ($in); 5861abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x1F) << 6; 5871abfaba4SAndreas Gohr $mState = 1; 5881abfaba4SAndreas Gohr $mBytes = 2; 5891abfaba4SAndreas Gohr 5901abfaba4SAndreas Gohr } else if (0xE0 == (0xF0 & ($in))) { 5911abfaba4SAndreas Gohr // First octet of 3 octet sequence 5921abfaba4SAndreas Gohr $mUcs4 = ($in); 5931abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x0F) << 12; 5941abfaba4SAndreas Gohr $mState = 2; 5951abfaba4SAndreas Gohr $mBytes = 3; 5961abfaba4SAndreas Gohr 5971abfaba4SAndreas Gohr } else if (0xF0 == (0xF8 & ($in))) { 5981abfaba4SAndreas Gohr // First octet of 4 octet sequence 5991abfaba4SAndreas Gohr $mUcs4 = ($in); 6001abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x07) << 18; 6011abfaba4SAndreas Gohr $mState = 3; 6021abfaba4SAndreas Gohr $mBytes = 4; 6031abfaba4SAndreas Gohr 6041abfaba4SAndreas Gohr } else if (0xF8 == (0xFC & ($in))) { 6051abfaba4SAndreas Gohr /* First octet of 5 octet sequence. 6061abfaba4SAndreas Gohr * 6071abfaba4SAndreas Gohr * This is illegal because the encoded codepoint must be either 6081abfaba4SAndreas Gohr * (a) not the shortest form or 6091abfaba4SAndreas Gohr * (b) outside the Unicode range of 0-0x10FFFF. 6101abfaba4SAndreas Gohr * Rather than trying to resynchronize, we will carry on until the end 6111abfaba4SAndreas Gohr * of the sequence and let the later error handling code catch it. 
6121abfaba4SAndreas Gohr */ 6131abfaba4SAndreas Gohr $mUcs4 = ($in); 6141abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x03) << 24; 6151abfaba4SAndreas Gohr $mState = 4; 6161abfaba4SAndreas Gohr $mBytes = 5; 6171abfaba4SAndreas Gohr 6181abfaba4SAndreas Gohr } else if (0xFC == (0xFE & ($in))) { 6191abfaba4SAndreas Gohr // First octet of 6 octet sequence, see comments for 5 octet sequence. 6201abfaba4SAndreas Gohr $mUcs4 = ($in); 6211abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 1) << 30; 6221abfaba4SAndreas Gohr $mState = 5; 6231abfaba4SAndreas Gohr $mBytes = 6; 6241abfaba4SAndreas Gohr 6251abfaba4SAndreas Gohr } elseif($strict) { 6261abfaba4SAndreas Gohr /* Current octet is neither in the US-ASCII range nor a legal first 6271abfaba4SAndreas Gohr * octet of a multi-octet sequence. 6281abfaba4SAndreas Gohr */ 6291abfaba4SAndreas Gohr trigger_error( 6301abfaba4SAndreas Gohr 'utf8_to_unicode: Illegal sequence identifier '. 6311abfaba4SAndreas Gohr 'in UTF-8 at byte '.$i, 6321abfaba4SAndreas Gohr E_USER_WARNING 6331abfaba4SAndreas Gohr ); 63444881bd0Shenning.noren return false; 6351abfaba4SAndreas Gohr 6361abfaba4SAndreas Gohr } 6371abfaba4SAndreas Gohr 6381abfaba4SAndreas Gohr } else { 6391abfaba4SAndreas Gohr 6401abfaba4SAndreas Gohr // When mState is non-zero, we expect a continuation of the multi-octet 6411abfaba4SAndreas Gohr // sequence 6421abfaba4SAndreas Gohr if (0x80 == (0xC0 & ($in))) { 6431abfaba4SAndreas Gohr 6441abfaba4SAndreas Gohr // Legal continuation. 6451abfaba4SAndreas Gohr $shift = ($mState - 1) * 6; 6461abfaba4SAndreas Gohr $tmp = $in; 6471abfaba4SAndreas Gohr $tmp = ($tmp & 0x0000003F) << $shift; 6481abfaba4SAndreas Gohr $mUcs4 |= $tmp; 6491abfaba4SAndreas Gohr 6501abfaba4SAndreas Gohr /** 6511abfaba4SAndreas Gohr * End of the multi-octet sequence. 
mUcs4 now contains the final 6521abfaba4SAndreas Gohr * Unicode codepoint to be output 6531abfaba4SAndreas Gohr */ 6541abfaba4SAndreas Gohr if (0 == --$mState) { 6551abfaba4SAndreas Gohr 6561abfaba4SAndreas Gohr /* 6571abfaba4SAndreas Gohr * Check for illegal sequences and codepoints. 6581abfaba4SAndreas Gohr */ 6591abfaba4SAndreas Gohr // From Unicode 3.1, non-shortest form is illegal 6601abfaba4SAndreas Gohr if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 6611abfaba4SAndreas Gohr ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 6621abfaba4SAndreas Gohr ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 6631abfaba4SAndreas Gohr (4 < $mBytes) || 6641abfaba4SAndreas Gohr // From Unicode 3.2, surrogate characters are illegal 6651abfaba4SAndreas Gohr (($mUcs4 & 0xFFFFF800) == 0xD800) || 6661abfaba4SAndreas Gohr // Codepoints outside the Unicode range are illegal 6671abfaba4SAndreas Gohr ($mUcs4 > 0x10FFFF)) { 6681abfaba4SAndreas Gohr 6691abfaba4SAndreas Gohr if($strict){ 6701abfaba4SAndreas Gohr trigger_error( 6711abfaba4SAndreas Gohr 'utf8_to_unicode: Illegal sequence or codepoint '. 6721abfaba4SAndreas Gohr 'in UTF-8 at byte '.$i, 6731abfaba4SAndreas Gohr E_USER_WARNING 6741abfaba4SAndreas Gohr ); 6751abfaba4SAndreas Gohr 67644881bd0Shenning.noren return false; 6771abfaba4SAndreas Gohr } 6781abfaba4SAndreas Gohr 6791abfaba4SAndreas Gohr } 6801abfaba4SAndreas Gohr 6811abfaba4SAndreas Gohr if (0xFEFF != $mUcs4) { 6821abfaba4SAndreas Gohr // BOM is legal but we don't want to output it 6831abfaba4SAndreas Gohr $out[] = $mUcs4; 6841abfaba4SAndreas Gohr } 6851abfaba4SAndreas Gohr 6861abfaba4SAndreas Gohr //initialize UTF8 cache 6871abfaba4SAndreas Gohr $mState = 0; 6881abfaba4SAndreas Gohr $mUcs4 = 0; 6891abfaba4SAndreas Gohr $mBytes = 1; 6901abfaba4SAndreas Gohr } 6911abfaba4SAndreas Gohr 6921abfaba4SAndreas Gohr } elseif($strict) { 6931abfaba4SAndreas Gohr /** 6941abfaba4SAndreas Gohr *((0xC0 & (*in) != 0x80) && (mState != 0)) 6951abfaba4SAndreas Gohr * Incomplete multi-octet sequence. 
6961abfaba4SAndreas Gohr */ 6971abfaba4SAndreas Gohr trigger_error( 6981abfaba4SAndreas Gohr 'utf8_to_unicode: Incomplete multi-octet '. 6991abfaba4SAndreas Gohr ' sequence in UTF-8 at byte '.$i, 7001abfaba4SAndreas Gohr E_USER_WARNING 7011abfaba4SAndreas Gohr ); 7021abfaba4SAndreas Gohr 70344881bd0Shenning.noren return false; 70482257610Sandi } 70582257610Sandi } 70682257610Sandi } 7071abfaba4SAndreas Gohr return $out; 70882257610Sandi} 70982257610Sandi 71082257610Sandi/** 7111abfaba4SAndreas Gohr * Takes an array of ints representing the Unicode characters and returns 7121abfaba4SAndreas Gohr * a UTF-8 string. Astral planes are supported ie. the ints in the 7131abfaba4SAndreas Gohr * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 7141abfaba4SAndreas Gohr * are not allowed. 71582257610Sandi * 7161abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input 7171abfaba4SAndreas Gohr * array contains ints that represent surrogates or are outside the 7181abfaba4SAndreas Gohr * Unicode range and raises a PHP error at level E_USER_WARNING 7191abfaba4SAndreas Gohr * 7201abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to use 7211abfaba4SAndreas Gohr * output buffering to concatenate the UTF-8 string (faster) as well as 7221abfaba4SAndreas Gohr * reference the array by it's keys 7231abfaba4SAndreas Gohr * 7241abfaba4SAndreas Gohr * @param array of unicode code points representing a string 7251abfaba4SAndreas Gohr * @param boolean Check for invalid sequences? 
/**
 * Encode a list of Unicode code points as a UTF-8 string.
 *
 * Astral planes are supported, i.e. the ints in the input can be > 0xFFFF.
 * Occurrences of the BOM (0xFEFF) are dropped. Surrogates (0xD800-0xDFFF)
 * and values outside 0..0x10FFFF cannot be encoded: in strict mode they
 * raise a PHP error at level E_USER_WARNING and the function returns false,
 * otherwise they are silently skipped.
 *
 * The result is assembled by plain string concatenation, so no output buffer
 * is left open when strict mode bails out early.
 *
 * @param array   $arr    unicode code points representing a string
 * @param boolean $strict Check for invalid code points?
 * @return mixed UTF-8 string, or false if strict mode found an invalid code
 *               point; '' when $arr is not an array
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # Outside the Unicode range (negative values included)
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                );
                return false;
            }

        # ASCII range (including control chars)
        } elseif ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } elseif ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } elseif ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                );
                return false;
            }

        # 3 byte sequence
        } elseif ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));

        }
    }

    return $utf8;
}

/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the string
 * is decoded with utf8_to_unicode() (defined earlier in this file, called in
 * non-strict mode) and every code point is packed as big-endian 16 bit
 * units; code points above 0xFFFF are written as a surrogate pair.
 *
 * @param string  $str UTF-8 string (taken by reference, not modified)
 * @param boolean $bom prepend the UTF-16BE byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

    $uni = utf8_to_unicode($str);
    foreach($uni as $cp){
        if($cp > 0xFFFF){
            // astral plane: split into lead and trail surrogate
            $cp -= 0x10000;
            $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x03FF));
        }else{
            $out .= pack('n',$cp);
        }
    }
    return $out;
}

/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is read as big-endian 16 bit units. A lead surrogate directly
 * followed by a trail surrogate is combined into one astral code point.
 * Unpaired surrogates are handed to unicode_to_utf8() unchanged, which skips
 * them in its non-strict mode; a BOM is skipped there as well.
 *
 * @param string $str UTF-16BE string (taken by reference, not modified)
 * @return string UTF-8 encoded string
 * @see    unicode_to_utf8
 */
function utf16be_to_utf8(&$str) {
    $units = unpack('n*',$str);
    if(!is_array($units)) return '';

    $uni  = array();
    $lead = 0; // pending lead surrogate, 0 when there is none
    foreach($units as $unit){
        if($lead){
            if($unit >= 0xDC00 && $unit <= 0xDFFF){
                $uni[] = 0x10000 + (($lead - 0xD800) << 10) + ($unit - 0xDC00);
                $lead  = 0;
                continue;
            }
            // lead surrogate without a trail surrogate
            $uni[] = $lead;
            $lead  = 0;
        }
        if($unit >= 0xD800 && $unit <= 0xDBFF){
            $lead = $unit;
        }else{
            $uni[] = $unit;
        }
    }
    if($lead) $uni[] = $lead;

    return unicode_to_utf8($uni);
}

/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is scanned once from left to right: every well-formed UTF-8
 * sequence is copied to the result, every other byte is swapped for $replace.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    // "s" lets the catch-all "." accept every byte value, so there is always
    // a match starting exactly at $pos
    $pattern = '/'.$UTF8_BAD.'/sS';

    $len    = strlen($str);
    $pos    = 0;
    $result = '';
    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // group 2 only takes part in the match for an invalid byte
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos += strlen($matches[0]);
    }
    return $result;
}

/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * Indexes at or below 0 give 0, indexes at or beyond the string length give
 * the string length. Otherwise the index is moved off continuation bytes
 * (10xxxxxx) in the requested direction.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = up (current character)
 *                           true = down (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

    if ($i <= 0) return 0;

    $limit = strlen($str);
    if ($i>=$limit) return $limit;

    if ($next) {
        // forward to the first byte of the following character
        while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
    } else {
        // back to the first byte of the character $i points into
        while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
    }

    return $i;
}

// only needed if no mb_string available; the defined() check keeps this
// section loadable when the detection at the top of the file has not run
if(!defined('UTF8_MBSTRING') || !UTF8_MBSTRING){

    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps the code points of lower case letters (keys) to
     * the code points of their corresponding upper case letters (values).
     * A few keys are listed more than once, always with the same value.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_LOWER_TO_UPPER;
    $UTF8_LOWER_TO_UPPER = array(
        0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
        0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
        0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
        0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
        0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
        0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
        0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
        0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
        0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
        0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
        0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
        0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
        0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
        0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
        0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
        0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
        0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
        0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
        0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
        0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
        0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
        0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
        0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
        0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
        0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
        0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
        0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
        0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
        0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
        0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
        0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
        0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
        0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
        0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
        0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
        0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
        0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
        0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
        0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
        0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
        0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
        0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
        0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
    );

    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps the code points of upper case letters (keys) to
     * the code points of their corresponding lower case letters (values). It
     * is built by flipping $UTF8_LOWER_TO_UPPER.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_UPPER_TO_LOWER;
    $UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented lower case letters to replacements taken
 * from the ASCII-7 range. It contains lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented upper case letters to replacements taken
 * from the ASCII-7 range. It contains upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
UTF-8 array of common special characters 1026099ada41Sandi * 1027099ada41Sandi * This array should contain all special characters (not a letter or digit) 1028099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum 1029099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special 1030099ada41Sandi * chars. 1031099ada41Sandi * 1032099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! 1033ad81d431SAndreas Gohr * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a 1034099ada41Sandi * 1035099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org> 1036099ada41Sandi * @see utf8_stripspecials() 1037099ada41Sandi */ 103854662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS; 1039099ada41Sandi$UTF8_SPECIAL_CHARS = array( 1040099ada41Sandi 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 1041ad81d431SAndreas Gohr 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 10425c812709Sandi 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 10435c812709Sandi 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 1044099ada41Sandi 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 1045099ada41Sandi 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 1046099ada41Sandi 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 1047099ada41Sandi 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 1048099ada41Sandi 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 1049099ada41Sandi 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 1050099ada41Sandi 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 1051099ada41Sandi 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 
1052099ada41Sandi 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 1053099ada41Sandi 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 1054099ada41Sandi 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 1055099ada41Sandi 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 1056099ada41Sandi 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 1057099ada41Sandi 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 1058099ada41Sandi 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 1059099ada41Sandi 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 1060099ada41Sandi 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 1061099ada41Sandi 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 1062099ada41Sandi 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 1063099ada41Sandi 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 1064099ada41Sandi 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 1065099ada41Sandi 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 1066099ada41Sandi 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 1067099ada41Sandi 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 1068099ada41Sandi 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 1069099ada41Sandi 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 1070099ada41Sandi 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 1071099ada41Sandi 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 1072099ada41Sandi 0x2703, 0x2704, 
0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 1073099ada41Sandi 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 1074099ada41Sandi 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 1075099ada41Sandi 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 1076099ada41Sandi 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 1077099ada41Sandi 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 1078099ada41Sandi 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 1079099ada41Sandi 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 1080099ada41Sandi 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 1081099ada41Sandi 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 1082099ada41Sandi 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 1083099ada41Sandi 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 1084099ada41Sandi 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 1085d5b23302STom N Harris 0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 1086d5b23302STom N Harris 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, 1087d5b23302STom N Harris 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 1088d5b23302STom N Harris 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 1089099ada41Sandi 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 1090099ada41Sandi 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 1091099ada41Sandi 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 1092099ada41Sandi 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 1093d5b23302STom N 
Harris 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 1094d5b23302STom N Harris 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 1095d5b23302STom N Harris 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 1096d5b23302STom N Harris 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 1097d5b23302STom N Harris 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 1098d5b23302STom N Harris 0xffeb, 0xffec, 0xffed, 0xffee, 1099099ada41Sandi); 1100340756e4Sandi 1101720307d9Schris// utf8 version of above data 1102720307d9Schrisglobal $UTF8_SPECIAL_CHARS2; 1103720307d9Schris$UTF8_SPECIAL_CHARS2 = 1104*37242afaSTom N Harris "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~ �'. 1105720307d9Schris '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'. 1106720307d9Schris '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'. 1107720307d9Schris '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿–—―‗‘’‚“”�'. 1108720307d9Schris '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'. 1109720307d9Schris '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'. 1110720307d9Schris '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'. 1111720307d9Schris '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'. 1112720307d9Schris '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'. 1113720307d9Schris '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'. 1114720307d9Schris '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'. 1115720307d9Schris '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. 1116720307d9Schris '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. 1117720307d9Schris '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. 1118d5b23302STom N Harris '➷➸➹➺➻➼➽➾'. 1119d5b23302STom N Harris ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. 1120d5b23302STom N Harris '�'. 1121d5b23302STom N Harris '�ﹼﹽ'. 1122d5b23302STom N Harris '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. 
1123d5b23302STom N Harris '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'; 1124720307d9Schris 11258a831f2bSAndreas Gohr/** 11268a831f2bSAndreas Gohr * Romanization lookup table 11278a831f2bSAndreas Gohr * 11288a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language 11298a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII. 11308a831f2bSAndreas Gohr * 11318a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works 11328a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement 11338a831f2bSAndreas Gohr * only. Specialities of each language are not supported. 11348a831f2bSAndreas Gohr * 11358a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org> 11368a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com> 11378a831f2bSAndreas Gohr * @link http://www.uconv.com/translit.htm 11388a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi> 11398a831f2bSAndreas Gohr * @link http://kanjidict.stc.cx/hiragana.php?src=2 11408a831f2bSAndreas Gohr * @link http://www.translatum.gr/converter/greek-transliteration.htm 11418a831f2bSAndreas Gohr * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 11428a831f2bSAndreas Gohr * @link http://www.btranslations.com/resources/romanization/korean.asp 11438a831f2bSAndreas Gohr */ 114454662a04SAndreas Gohrglobal $UTF8_ROMANIZATION; 11458a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array( 11468a831f2bSAndreas Gohr //russian cyrillic 11478a831f2bSAndreas Gohr 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 11488a831f2bSAndreas Gohr 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 11498a831f2bSAndreas Gohr 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 11508a831f2bSAndreas Gohr 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 11518a831f2bSAndreas Gohr 
'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 11528a831f2bSAndreas Gohr 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1153d8cb2602SDenis Simakov 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1154f5e334deSAndreas Gohr 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 11558a831f2bSAndreas Gohr 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 11568a831f2bSAndreas Gohr // Ukrainian cyrillic 11578a831f2bSAndreas Gohr 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 11588a831f2bSAndreas Gohr // Georgian 11598a831f2bSAndreas Gohr 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 11608a831f2bSAndreas Gohr 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 11618a831f2bSAndreas Gohr 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 11628a831f2bSAndreas Gohr 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 11638a831f2bSAndreas Gohr 'ჰ'=>'xh', 11648a831f2bSAndreas Gohr //Sanskrit 11658a831f2bSAndreas Gohr 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 11668a831f2bSAndreas Gohr 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 11678a831f2bSAndreas Gohr 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 11688a831f2bSAndreas Gohr 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 11698a831f2bSAndreas Gohr 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 11708a831f2bSAndreas Gohr 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 11718a831f2bSAndreas Gohr 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 11728a831f2bSAndreas Gohr //Hebrew 11733dbad6dcSDenis Simakov 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 11743dbad6dcSDenis Simakov 
'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 11753dbad6dcSDenis Simakov 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 11768a831f2bSAndreas Gohr 'ש'=>'sh','ת'=>'t', 11778a831f2bSAndreas Gohr //Arabic 11788a831f2bSAndreas Gohr 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 11798a831f2bSAndreas Gohr 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 11808a831f2bSAndreas Gohr 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 11818a831f2bSAndreas Gohr 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 11828a831f2bSAndreas Gohr 11838a831f2bSAndreas Gohr // Japanese hiragana 11848a831f2bSAndreas Gohr 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 11858a831f2bSAndreas Gohr 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 11868a831f2bSAndreas Gohr 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 11878a831f2bSAndreas Gohr 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 11888a831f2bSAndreas Gohr 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 11898a831f2bSAndreas Gohr 'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 11908a831f2bSAndreas Gohr 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 11918a831f2bSAndreas Gohr 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 11928a831f2bSAndreas Gohr 'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 11938a831f2bSAndreas Gohr 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 11948a831f2bSAndreas Gohr 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 11958a831f2bSAndreas Gohr 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 11968a831f2bSAndreas Gohr 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 11978a831f2bSAndreas Gohr 
'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 11988a831f2bSAndreas Gohr 'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 11998a831f2bSAndreas Gohr 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 12008a831f2bSAndreas Gohr 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 12018a831f2bSAndreas Gohr 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 12028a831f2bSAndreas Gohr 'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 12038a831f2bSAndreas Gohr 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 12048a831f2bSAndreas Gohr 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 12058a831f2bSAndreas Gohr 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 12068a831f2bSAndreas Gohr 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 12078a831f2bSAndreas Gohr 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 12088a831f2bSAndreas Gohr 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 12098a831f2bSAndreas Gohr 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 12108a831f2bSAndreas Gohr 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 12118a831f2bSAndreas Gohr 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 12128a831f2bSAndreas Gohr 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 12138a831f2bSAndreas Gohr 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 12148a831f2bSAndreas Gohr 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 12158a831f2bSAndreas Gohr 'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 12168a831f2bSAndreas Gohr 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 12178a831f2bSAndreas Gohr 
'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 12188a831f2bSAndreas Gohr 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 12198a831f2bSAndreas Gohr 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 12208a831f2bSAndreas Gohr 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 12218a831f2bSAndreas Gohr 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 12228a831f2bSAndreas Gohr 'じゅ'=>'zyu', 12238a831f2bSAndreas Gohr // Japanese katakana 12248a831f2bSAndreas Gohr 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 12258a831f2bSAndreas Gohr 'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 12268a831f2bSAndreas Gohr 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 12278a831f2bSAndreas Gohr 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 12288a831f2bSAndreas Gohr 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 12298a831f2bSAndreas Gohr 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 12308a831f2bSAndreas Gohr 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 12318a831f2bSAndreas Gohr 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 12328a831f2bSAndreas Gohr 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 12338a831f2bSAndreas Gohr 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 12348a831f2bSAndreas Gohr 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 12358a831f2bSAndreas Gohr 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 12368a831f2bSAndreas Gohr 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 12378a831f2bSAndreas Gohr 
'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 12388a831f2bSAndreas Gohr 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 12398a831f2bSAndreas Gohr 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 12408a831f2bSAndreas Gohr 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 12418a831f2bSAndreas Gohr 'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 12428a831f2bSAndreas Gohr 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 12438a831f2bSAndreas Gohr 'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 12448a831f2bSAndreas Gohr 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 12458a831f2bSAndreas Gohr 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 12468a831f2bSAndreas Gohr 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 12478a831f2bSAndreas Gohr 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 12488a831f2bSAndreas Gohr 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 12498a831f2bSAndreas Gohr 'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 12508a831f2bSAndreas Gohr 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 12518a831f2bSAndreas Gohr 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 12528a831f2bSAndreas Gohr 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 12538a831f2bSAndreas Gohr 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 12548a831f2bSAndreas Gohr 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 12558a831f2bSAndreas Gohr 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 12568a831f2bSAndreas Gohr 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 12578a831f2bSAndreas Gohr 
'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 12588a831f2bSAndreas Gohr 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 12598a831f2bSAndreas Gohr 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 12608a831f2bSAndreas Gohr 'ジョ'=>'zyo','ジュ'=>'zyu', 12618a831f2bSAndreas Gohr 12628a831f2bSAndreas Gohr // "Greeklish" 12638a831f2bSAndreas Gohr 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 12648a831f2bSAndreas Gohr 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 12658a831f2bSAndreas Gohr 12668a831f2bSAndreas Gohr // Thai 12678a831f2bSAndreas Gohr 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 12688a831f2bSAndreas Gohr 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 12698a831f2bSAndreas Gohr 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 12708a831f2bSAndreas Gohr 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 12718a831f2bSAndreas Gohr 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 12728a831f2bSAndreas Gohr 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 12738a831f2bSAndreas Gohr 'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 12748a831f2bSAndreas Gohr '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 12758a831f2bSAndreas Gohr 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 12768a831f2bSAndreas Gohr 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 12778a831f2bSAndreas Gohr '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 12788a831f2bSAndreas Gohr 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 12798a831f2bSAndreas Gohr 
'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 12808a831f2bSAndreas Gohr 'เ–ียว'=>'iao', 12818a831f2bSAndreas Gohr 12828a831f2bSAndreas Gohr // Korean 12838a831f2bSAndreas Gohr 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 12848a831f2bSAndreas Gohr 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 12858a831f2bSAndreas Gohr 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 12868a831f2bSAndreas Gohr 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 12878a831f2bSAndreas Gohr 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 12888a831f2bSAndreas Gohr 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 12898a831f2bSAndreas Gohr); 1290340756e4Sandi 1291340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 : 12928a831f2bSAndreas Gohr 1293