<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file  filename to encode
 * @param  boolean $safe  skip encoding when $file only consists of
 *                        characters that are already URL safe
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    // already made of safe characters only (this includes '%', so a
    // previously encoded name is returned untouched)
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte above 0x7F is present
 */
function utf8_isASCII($str){
    // byte-wise match (no /u modifier): any byte >= 0x80 is non-ASCII
    return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string $str without any byte above 0x7F
 */
function utf8_strip($str){
    $ascii = '';
    $len   = strlen($str);
    for($i=0; $i<$len; $i++){
        if(ord($str[$i]) < 128){
            $ascii .= $str[$i];
        }
    }
    return $ascii;
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the lead byte / continuation byte structure is checked. Lead bytes
 * announcing 5 and 6 byte sequences are accepted and overlong encodings
 * are not rejected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $byte = ord($Str[$i]);
        if ($byte < 0x80) continue;                 # 0bbbbbbb
        elseif (($byte & 0xE0) == 0xC0) $n=1;       # 110bbbbb
        elseif (($byte & 0xF0) == 0xE0) $n=2;       # 1110bbbb
        elseif (($byte & 0xF8) == 0xF0) $n=3;       # 11110bbb
        elseif (($byte & 0xFC) == 0xF8) $n=4;       # 111110bb
        elseif (($byte & 0xFE) == 0xFC) $n=5;       # 1111110b
        else return false;                          # Does not match any model
        for ($j=0; $j<$n; $j++) {                   # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * Unicode aware replacement for substr()
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str
 * @param  int      $start  offset in characters
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string
 */
function utf8_substr($str,$start,$length=null){
    // the s modifier makes '.' match newlines as well, otherwise line
    // breaks would silently vanish from the result
    preg_match_all('/./us', $str, $ar);

    // strict comparison: a length of 0 has to yield an empty string and
    // must not be mistaken for "no length given"
    if($length !== null) {
        return join('',array_slice($ar[0],$start,$length));
    }
    return join('',array_slice($ar[0],$start));
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * With the default $length of 0 the replacement is inserted at $start
 * without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  offset in characters
 * @param  int    $length number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter, must not be empty
 * @param  string $str
 * @return array|false false (plus an E_USER_WARNING) on empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return false;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * The replacement is handed to preg_replace() unchanged, so backreference
 * sequences ($1, \1) in $r are interpreted by it.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   search string(s)
 * @param  string|array $r   replacement(s)
 * @param  string       $str subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            // the delimiter has to be quoted here as well, otherwise a
            // '!' inside a search string breaks the pattern
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }
    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip, each taken literally
 *                          (ranges are not supported)
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass (covers \ - ] [ ^ and /)
    $charlist = preg_quote($charlist,'/');

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip, each taken literally
 *                          (ranges are not supported)
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass (covers \ - ] [ ^ and /)
    $charlist = preg_quote($charlist,'/');

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip, each taken literally
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // hand the charlist on, otherwise only whitespace would be stripped
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
        return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset avoids notices for code points without a table entry
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
        return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset avoids notices for code points without a table entry
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * Reads the global lookup tables $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * Reads the global lookup table $UTF8_ROMANIZATION.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}
3018a831f2bSAndreas Gohr 3028a831f2bSAndreas Gohr/** 303099ada41Sandi * Removes special characters (nonalphanumeric) from a UTF-8 string 304099ada41Sandi * 305099ada41Sandi * This function adds the controlchars 0x00 to 0x19 to the array of 306099ada41Sandi * stripped chars (they are not included in $UTF8_SPECIAL_CHARS) 307099ada41Sandi * 308099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org> 309099ada41Sandi * @param string $string The UTF8 string to strip of special chars 310099ada41Sandi * @param string $repl Replace special with this string 311b4ce25e9SAndreas Gohr * @param string $additional Additional chars to strip (used in regexp char class) 312099ada41Sandi */ 313b4ce25e9SAndreas Gohrfunction utf8_stripspecials($string,$repl='',$additional=''){ 314099ada41Sandi global $UTF8_SPECIAL_CHARS; 315099ada41Sandi 3165c812709Sandi static $specials = null; 3175c812709Sandi if(is_null($specials)){ 3185c812709Sandi $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/'); 3195c812709Sandi } 320099ada41Sandi 321b4ce25e9SAndreas Gohr return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string); 322099ada41Sandi} 323099ada41Sandi 324099ada41Sandi/** 3252f954959Sandi * This is an Unicode aware replacement for strpos 3262f954959Sandi * 3272f954959Sandi * Uses mb_string extension if available 3282f954959Sandi * 329f29317c1Sandi * @author Harry Fuecks <hfuecks@gmail.com> 3302f954959Sandi * @see strpos() 3312f954959Sandi */ 3322f954959Sandifunction utf8_strpos($haystack, $needle,$offset=0) { 3332f954959Sandi if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos')) 3342f954959Sandi return mb_strpos($haystack,$needle,$offset,'utf-8'); 3352f954959Sandi 336f29317c1Sandi if(!$offset){ 337f29317c1Sandi $ar = utf8_explode($needle, $str); 338f29317c1Sandi if ( count($ar) > 1 ) { 339f29317c1Sandi return utf8_strlen($ar[0]); 340f29317c1Sandi } 341f29317c1Sandi return false; 342f29317c1Sandi }else{ 343f29317c1Sandi if ( !is_int($offset) ) { 
344f29317c1Sandi trigger_error('Offset must be an integer',E_USER_WARNING); 345f29317c1Sandi return false; 346f29317c1Sandi } 3472f954959Sandi 348f29317c1Sandi $str = utf8_substr($str, $offset); 349f29317c1Sandi 350f29317c1Sandi if ( false !== ($pos = utf8_strpos($str,$needle))){ 351f29317c1Sandi return $pos + $offset; 3522f954959Sandi } 353f29317c1Sandi return false; 3542f954959Sandi } 3552f954959Sandi} 3562f954959Sandi 3572f954959Sandi/** 358ea2eed85Sandi * Encodes UTF-8 characters to HTML entities 359ea2eed85Sandi * 360ea2eed85Sandi * @author <vpribish at shopping dot com> 361ea2eed85Sandi * @link http://www.php.net/manual/en/function.utf8-decode.php 362ea2eed85Sandi */ 363ea2eed85Sandifunction utf8_tohtml ($str) { 364ea2eed85Sandi $ret = ''; 365ea2eed85Sandi $max = strlen($str); 366ea2eed85Sandi $last = 0; // keeps the index of the last regular character 367ea2eed85Sandi for ($i=0; $i<$max; $i++) { 368ea2eed85Sandi $c = $str{$i}; 369ea2eed85Sandi $c1 = ord($c); 370ea2eed85Sandi if ($c1>>5 == 6) { // 110x xxxx, 110 prefix for 2 bytes unicode 371ea2eed85Sandi $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed 372ea2eed85Sandi $c1 &= 31; // remove the 3 bit two bytes prefix 373ea2eed85Sandi $c2 = ord($str{++$i}); // the next byte 374ea2eed85Sandi $c2 &= 63; // remove the 2 bit trailing byte prefix 375ea2eed85Sandi $c2 |= (($c1 & 3) << 6); // last 2 bits of c1 become first 2 of c2 376ea2eed85Sandi $c1 >>= 2; // c1 shifts 2 to the right 377ea2eed85Sandi $ret .= '&#' . ($c1 * 100 + $c2) . ';'; // this is the fastest string concatenation 378ea2eed85Sandi $last = $i+1; 379ea2eed85Sandi } 380ea2eed85Sandi } 381ea2eed85Sandi return $ret . substr($str, $last, $i); // append the last batch of regular characters 382ea2eed85Sandi} 383ea2eed85Sandi 384ea2eed85Sandi/** 3851abfaba4SAndreas Gohr * Takes an UTF-8 string and returns an array of ints representing the 3861abfaba4SAndreas Gohr * Unicode characters. Astral planes are supported ie. 
the ints in the 3871abfaba4SAndreas Gohr * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 3881abfaba4SAndreas Gohr * are not allowed. 38982257610Sandi * 3901abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input 3911abfaba4SAndreas Gohr * string isn't a valid UTF-8 octet sequence and raises a PHP error at 3921abfaba4SAndreas Gohr * level E_USER_WARNING 3931abfaba4SAndreas Gohr * 3941abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to 3951abfaba4SAndreas Gohr * trigger errors on encountering bad bytes 3961abfaba4SAndreas Gohr * 3971abfaba4SAndreas Gohr * @author <hsivonen@iki.fi> 3981abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com> 3991abfaba4SAndreas Gohr * @param string UTF-8 encoded string 4001abfaba4SAndreas Gohr * @param boolean Check for invalid sequences? 4011abfaba4SAndreas Gohr * @return mixed array of unicode code points or FALSE if UTF-8 invalid 4021abfaba4SAndreas Gohr * @see unicode_to_utf8 4031abfaba4SAndreas Gohr * @link http://hsivonen.iki.fi/php-utf8/ 4041abfaba4SAndreas Gohr * @link http://sourceforge.net/projects/phputf8/ 40582257610Sandi */ 4061abfaba4SAndreas Gohrfunction utf8_to_unicode($str,$strict=false) { 4071abfaba4SAndreas Gohr $mState = 0; // cached expected number of octets after the current octet 4081abfaba4SAndreas Gohr // until the beginning of the next UTF8 character sequence 4091abfaba4SAndreas Gohr $mUcs4 = 0; // cached Unicode character 4101abfaba4SAndreas Gohr $mBytes = 1; // cached expected number of octets in the current sequence 41182257610Sandi 4121abfaba4SAndreas Gohr $out = array(); 4131abfaba4SAndreas Gohr 4141abfaba4SAndreas Gohr $len = strlen($str); 4151abfaba4SAndreas Gohr 4161abfaba4SAndreas Gohr for($i = 0; $i < $len; $i++) { 4171abfaba4SAndreas Gohr 4181abfaba4SAndreas Gohr $in = ord($str{$i}); 4191abfaba4SAndreas Gohr 4201abfaba4SAndreas Gohr if ( $mState == 0) { 4211abfaba4SAndreas Gohr 4221abfaba4SAndreas 
Gohr // When mState is zero we expect either a US-ASCII character or a 4231abfaba4SAndreas Gohr // multi-octet sequence. 4241abfaba4SAndreas Gohr if (0 == (0x80 & ($in))) { 4251abfaba4SAndreas Gohr // US-ASCII, pass straight through. 4261abfaba4SAndreas Gohr $out[] = $in; 4271abfaba4SAndreas Gohr $mBytes = 1; 4281abfaba4SAndreas Gohr 4291abfaba4SAndreas Gohr } else if (0xC0 == (0xE0 & ($in))) { 4301abfaba4SAndreas Gohr // First octet of 2 octet sequence 4311abfaba4SAndreas Gohr $mUcs4 = ($in); 4321abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x1F) << 6; 4331abfaba4SAndreas Gohr $mState = 1; 4341abfaba4SAndreas Gohr $mBytes = 2; 4351abfaba4SAndreas Gohr 4361abfaba4SAndreas Gohr } else if (0xE0 == (0xF0 & ($in))) { 4371abfaba4SAndreas Gohr // First octet of 3 octet sequence 4381abfaba4SAndreas Gohr $mUcs4 = ($in); 4391abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x0F) << 12; 4401abfaba4SAndreas Gohr $mState = 2; 4411abfaba4SAndreas Gohr $mBytes = 3; 4421abfaba4SAndreas Gohr 4431abfaba4SAndreas Gohr } else if (0xF0 == (0xF8 & ($in))) { 4441abfaba4SAndreas Gohr // First octet of 4 octet sequence 4451abfaba4SAndreas Gohr $mUcs4 = ($in); 4461abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x07) << 18; 4471abfaba4SAndreas Gohr $mState = 3; 4481abfaba4SAndreas Gohr $mBytes = 4; 4491abfaba4SAndreas Gohr 4501abfaba4SAndreas Gohr } else if (0xF8 == (0xFC & ($in))) { 4511abfaba4SAndreas Gohr /* First octet of 5 octet sequence. 4521abfaba4SAndreas Gohr * 4531abfaba4SAndreas Gohr * This is illegal because the encoded codepoint must be either 4541abfaba4SAndreas Gohr * (a) not the shortest form or 4551abfaba4SAndreas Gohr * (b) outside the Unicode range of 0-0x10FFFF. 4561abfaba4SAndreas Gohr * Rather than trying to resynchronize, we will carry on until the end 4571abfaba4SAndreas Gohr * of the sequence and let the later error handling code catch it. 
4581abfaba4SAndreas Gohr */ 4591abfaba4SAndreas Gohr $mUcs4 = ($in); 4601abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 0x03) << 24; 4611abfaba4SAndreas Gohr $mState = 4; 4621abfaba4SAndreas Gohr $mBytes = 5; 4631abfaba4SAndreas Gohr 4641abfaba4SAndreas Gohr } else if (0xFC == (0xFE & ($in))) { 4651abfaba4SAndreas Gohr // First octet of 6 octet sequence, see comments for 5 octet sequence. 4661abfaba4SAndreas Gohr $mUcs4 = ($in); 4671abfaba4SAndreas Gohr $mUcs4 = ($mUcs4 & 1) << 30; 4681abfaba4SAndreas Gohr $mState = 5; 4691abfaba4SAndreas Gohr $mBytes = 6; 4701abfaba4SAndreas Gohr 4711abfaba4SAndreas Gohr } elseif($strict) { 4721abfaba4SAndreas Gohr /* Current octet is neither in the US-ASCII range nor a legal first 4731abfaba4SAndreas Gohr * octet of a multi-octet sequence. 4741abfaba4SAndreas Gohr */ 4751abfaba4SAndreas Gohr trigger_error( 4761abfaba4SAndreas Gohr 'utf8_to_unicode: Illegal sequence identifier '. 4771abfaba4SAndreas Gohr 'in UTF-8 at byte '.$i, 4781abfaba4SAndreas Gohr E_USER_WARNING 4791abfaba4SAndreas Gohr ); 4801abfaba4SAndreas Gohr return FALSE; 4811abfaba4SAndreas Gohr 4821abfaba4SAndreas Gohr } 4831abfaba4SAndreas Gohr 4841abfaba4SAndreas Gohr } else { 4851abfaba4SAndreas Gohr 4861abfaba4SAndreas Gohr // When mState is non-zero, we expect a continuation of the multi-octet 4871abfaba4SAndreas Gohr // sequence 4881abfaba4SAndreas Gohr if (0x80 == (0xC0 & ($in))) { 4891abfaba4SAndreas Gohr 4901abfaba4SAndreas Gohr // Legal continuation. 4911abfaba4SAndreas Gohr $shift = ($mState - 1) * 6; 4921abfaba4SAndreas Gohr $tmp = $in; 4931abfaba4SAndreas Gohr $tmp = ($tmp & 0x0000003F) << $shift; 4941abfaba4SAndreas Gohr $mUcs4 |= $tmp; 4951abfaba4SAndreas Gohr 4961abfaba4SAndreas Gohr /** 4971abfaba4SAndreas Gohr * End of the multi-octet sequence. 
mUcs4 now contains the final 4981abfaba4SAndreas Gohr * Unicode codepoint to be output 4991abfaba4SAndreas Gohr */ 5001abfaba4SAndreas Gohr if (0 == --$mState) { 5011abfaba4SAndreas Gohr 5021abfaba4SAndreas Gohr /* 5031abfaba4SAndreas Gohr * Check for illegal sequences and codepoints. 5041abfaba4SAndreas Gohr */ 5051abfaba4SAndreas Gohr // From Unicode 3.1, non-shortest form is illegal 5061abfaba4SAndreas Gohr if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 5071abfaba4SAndreas Gohr ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 5081abfaba4SAndreas Gohr ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 5091abfaba4SAndreas Gohr (4 < $mBytes) || 5101abfaba4SAndreas Gohr // From Unicode 3.2, surrogate characters are illegal 5111abfaba4SAndreas Gohr (($mUcs4 & 0xFFFFF800) == 0xD800) || 5121abfaba4SAndreas Gohr // Codepoints outside the Unicode range are illegal 5131abfaba4SAndreas Gohr ($mUcs4 > 0x10FFFF)) { 5141abfaba4SAndreas Gohr 5151abfaba4SAndreas Gohr if($strict){ 5161abfaba4SAndreas Gohr trigger_error( 5171abfaba4SAndreas Gohr 'utf8_to_unicode: Illegal sequence or codepoint '. 5181abfaba4SAndreas Gohr 'in UTF-8 at byte '.$i, 5191abfaba4SAndreas Gohr E_USER_WARNING 5201abfaba4SAndreas Gohr ); 5211abfaba4SAndreas Gohr 5221abfaba4SAndreas Gohr return FALSE; 5231abfaba4SAndreas Gohr } 5241abfaba4SAndreas Gohr 5251abfaba4SAndreas Gohr } 5261abfaba4SAndreas Gohr 5271abfaba4SAndreas Gohr if (0xFEFF != $mUcs4) { 5281abfaba4SAndreas Gohr // BOM is legal but we don't want to output it 5291abfaba4SAndreas Gohr $out[] = $mUcs4; 5301abfaba4SAndreas Gohr } 5311abfaba4SAndreas Gohr 5321abfaba4SAndreas Gohr //initialize UTF8 cache 5331abfaba4SAndreas Gohr $mState = 0; 5341abfaba4SAndreas Gohr $mUcs4 = 0; 5351abfaba4SAndreas Gohr $mBytes = 1; 5361abfaba4SAndreas Gohr } 5371abfaba4SAndreas Gohr 5381abfaba4SAndreas Gohr } elseif($strict) { 5391abfaba4SAndreas Gohr /** 5401abfaba4SAndreas Gohr *((0xC0 & (*in) != 0x80) && (mState != 0)) 5411abfaba4SAndreas Gohr * Incomplete multi-octet sequence. 
5421abfaba4SAndreas Gohr */ 5431abfaba4SAndreas Gohr trigger_error( 5441abfaba4SAndreas Gohr 'utf8_to_unicode: Incomplete multi-octet '. 5451abfaba4SAndreas Gohr ' sequence in UTF-8 at byte '.$i, 5461abfaba4SAndreas Gohr E_USER_WARNING 5471abfaba4SAndreas Gohr ); 5481abfaba4SAndreas Gohr 5491abfaba4SAndreas Gohr return FALSE; 55082257610Sandi } 55182257610Sandi } 55282257610Sandi } 5531abfaba4SAndreas Gohr return $out; 55482257610Sandi} 55582257610Sandi 55682257610Sandi/** 5571abfaba4SAndreas Gohr * Takes an array of ints representing the Unicode characters and returns 5581abfaba4SAndreas Gohr * a UTF-8 string. Astral planes are supported ie. the ints in the 5591abfaba4SAndreas Gohr * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 5601abfaba4SAndreas Gohr * are not allowed. 56182257610Sandi * 5621abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input 5631abfaba4SAndreas Gohr * array contains ints that represent surrogates or are outside the 5641abfaba4SAndreas Gohr * Unicode range and raises a PHP error at level E_USER_WARNING 5651abfaba4SAndreas Gohr * 5661abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to use 5671abfaba4SAndreas Gohr * output buffering to concatenate the UTF-8 string (faster) as well as 5681abfaba4SAndreas Gohr * reference the array by it's keys 5691abfaba4SAndreas Gohr * 5701abfaba4SAndreas Gohr * @param array of unicode code points representing a string 5711abfaba4SAndreas Gohr * @param boolean Check for invalid sequences? 
5721abfaba4SAndreas Gohr * @return mixed UTF-8 string or FALSE if array contains invalid code points 5731abfaba4SAndreas Gohr * @author <hsivonen@iki.fi> 5741abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com> 5751abfaba4SAndreas Gohr * @see utf8_to_unicode 5761abfaba4SAndreas Gohr * @link http://hsivonen.iki.fi/php-utf8/ 5771abfaba4SAndreas Gohr * @link http://sourceforge.net/projects/phputf8/ 57882257610Sandi */ 5791abfaba4SAndreas Gohrfunction unicode_to_utf8($arr,$strict=false) { 5801abfaba4SAndreas Gohr if (!is_array($arr)) return ''; 5811abfaba4SAndreas Gohr ob_start(); 582f949a01cSAndreas Gohr 5831abfaba4SAndreas Gohr foreach (array_keys($arr) as $k) { 5841abfaba4SAndreas Gohr 5851abfaba4SAndreas Gohr # ASCII range (including control chars) 5861abfaba4SAndreas Gohr if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) { 5871abfaba4SAndreas Gohr 5881abfaba4SAndreas Gohr echo chr($arr[$k]); 5891abfaba4SAndreas Gohr 5901abfaba4SAndreas Gohr # 2 byte sequence 5911abfaba4SAndreas Gohr } else if ($arr[$k] <= 0x07ff) { 5921abfaba4SAndreas Gohr 5931abfaba4SAndreas Gohr echo chr(0xc0 | ($arr[$k] >> 6)); 5941abfaba4SAndreas Gohr echo chr(0x80 | ($arr[$k] & 0x003f)); 5951abfaba4SAndreas Gohr 5961abfaba4SAndreas Gohr # Byte order mark (skip) 5971abfaba4SAndreas Gohr } else if($arr[$k] == 0xFEFF) { 5981abfaba4SAndreas Gohr 5991abfaba4SAndreas Gohr // nop -- zap the BOM 6001abfaba4SAndreas Gohr 6011abfaba4SAndreas Gohr # Test for illegal surrogates 6021abfaba4SAndreas Gohr } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) { 6031abfaba4SAndreas Gohr 6041abfaba4SAndreas Gohr // found a surrogate 6051abfaba4SAndreas Gohr if($strict){ 6061abfaba4SAndreas Gohr trigger_error( 6071abfaba4SAndreas Gohr 'unicode_to_utf8: Illegal surrogate '. 
6081abfaba4SAndreas Gohr 'at index: '.$k.', value: '.$arr[$k], 6091abfaba4SAndreas Gohr E_USER_WARNING 6101abfaba4SAndreas Gohr ); 6111abfaba4SAndreas Gohr return FALSE; 6121abfaba4SAndreas Gohr } 6131abfaba4SAndreas Gohr 6141abfaba4SAndreas Gohr # 3 byte sequence 6151abfaba4SAndreas Gohr } else if ($arr[$k] <= 0xffff) { 6161abfaba4SAndreas Gohr 6171abfaba4SAndreas Gohr echo chr(0xe0 | ($arr[$k] >> 12)); 6181abfaba4SAndreas Gohr echo chr(0x80 | (($arr[$k] >> 6) & 0x003f)); 6191abfaba4SAndreas Gohr echo chr(0x80 | ($arr[$k] & 0x003f)); 6201abfaba4SAndreas Gohr 6211abfaba4SAndreas Gohr # 4 byte sequence 6221abfaba4SAndreas Gohr } else if ($arr[$k] <= 0x10ffff) { 6231abfaba4SAndreas Gohr 6241abfaba4SAndreas Gohr echo chr(0xf0 | ($arr[$k] >> 18)); 6251abfaba4SAndreas Gohr echo chr(0x80 | (($arr[$k] >> 12) & 0x3f)); 6261abfaba4SAndreas Gohr echo chr(0x80 | (($arr[$k] >> 6) & 0x3f)); 6271abfaba4SAndreas Gohr echo chr(0x80 | ($arr[$k] & 0x3f)); 6281abfaba4SAndreas Gohr 6291abfaba4SAndreas Gohr } elseif($strict) { 6301abfaba4SAndreas Gohr 6311abfaba4SAndreas Gohr trigger_error( 6321abfaba4SAndreas Gohr 'unicode_to_utf8: Codepoint out of Unicode range '. 6331abfaba4SAndreas Gohr 'at index: '.$k.', value: '.$arr[$k], 6341abfaba4SAndreas Gohr E_USER_WARNING 6351abfaba4SAndreas Gohr ); 6361abfaba4SAndreas Gohr 6371abfaba4SAndreas Gohr // out of range 6381abfaba4SAndreas Gohr return FALSE; 63982257610Sandi } 64082257610Sandi } 6411abfaba4SAndreas Gohr 6421abfaba4SAndreas Gohr $result = ob_get_contents(); 6431abfaba4SAndreas Gohr ob_end_clean(); 6441abfaba4SAndreas Gohr return $result; 64582257610Sandi} 64682257610Sandi 64782257610Sandi/** 64815fa0b4fSAndreas Gohr * UTF-8 to UTF-16BE conversion. 64915fa0b4fSAndreas Gohr * 65015fa0b4fSAndreas Gohr * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 65115fa0b4fSAndreas Gohr */ 65215fa0b4fSAndreas Gohrfunction utf8_to_utf16be(&$str, $bom = false) { 65315fa0b4fSAndreas Gohr $out = $bom ? 
"\xFE\xFF" : ''; 65415fa0b4fSAndreas Gohr if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding')) 65515fa0b4fSAndreas Gohr return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8'); 65615fa0b4fSAndreas Gohr 65715fa0b4fSAndreas Gohr $uni = utf8_to_unicode($str); 65815fa0b4fSAndreas Gohr foreach($uni as $cp){ 65915fa0b4fSAndreas Gohr $out .= pack('n',$cp); 66015fa0b4fSAndreas Gohr } 66115fa0b4fSAndreas Gohr return $out; 66215fa0b4fSAndreas Gohr} 66315fa0b4fSAndreas Gohr 66415fa0b4fSAndreas Gohr/** 66515fa0b4fSAndreas Gohr * UTF-8 to UTF-16BE conversion. 66615fa0b4fSAndreas Gohr * 66715fa0b4fSAndreas Gohr * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 66815fa0b4fSAndreas Gohr */ 66915fa0b4fSAndreas Gohrfunction utf16be_to_utf8(&$str) { 67015fa0b4fSAndreas Gohr $uni = unpack('n*',$str); 67115fa0b4fSAndreas Gohr return unicode_to_utf8($uni); 67215fa0b4fSAndreas Gohr} 67315fa0b4fSAndreas Gohr 67415fa0b4fSAndreas Gohr/** 67582257610Sandi * UTF-8 Case lookup table 67682257610Sandi * 67782257610Sandi * This lookuptable defines the upper case letters to their correspponding 67882257610Sandi * lower case letter in UTF-8 67982257610Sandi * 68082257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 68182257610Sandi */ 6828a831f2bSAndreas Gohrstatic $UTF8_LOWER_TO_UPPER = array( 68382257610Sandi 0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042, 68482257610Sandi 0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100, 68582257610Sandi 0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393, 68682257610Sandi 0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C, 68782257610Sandi 0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F, 68882257610Sandi 0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E, 68982257610Sandi 0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 
0x00F3=>0x00D3, 69082257610Sandi 0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A, 69182257610Sandi 0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9, 69282257610Sandi 0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C, 69382257610Sandi 0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4, 69482257610Sandi 0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164, 69582257610Sandi 0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156, 69682257610Sandi 0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118, 69782257610Sandi 0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128, 69882257610Sandi 0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428, 69982257610Sandi 0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055, 70082257610Sandi 0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A, 70182257610Sandi 0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC, 70282257610Sandi 0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0, 70382257610Sandi 0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D, 70482257610Sandi 0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0, 70582257610Sandi 0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5, 70682257610Sandi 0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA, 70782257610Sandi 0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045, 70882257610Sandi 0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F, 70982257610Sandi 0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048, 71082257610Sandi 0x00EB=>0x00CB, 
0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6, 71182257610Sandi 0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407, 71282257610Sandi 0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395, 71382257610Sandi 0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396, 71482257610Sandi 0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051, 71582257610Sandi 0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408, 71682257610Sandi 0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F, 71782257610Sandi 0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126, 71882257610Sandi 0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C, 71982257610Sandi 0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E, 72082257610Sandi 0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB, 72182257610Sandi 0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421, 72282257610Sandi 0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A, 72382257610Sandi 0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102, 72482257610Sandi 0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9, 72582257610Sandi 0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122, 72682257610Sandi); 72782257610Sandi 72882257610Sandi/** 72982257610Sandi * UTF-8 Case lookup table 73082257610Sandi * 73182257610Sandi * This lookup table maps the upper case letters to their corresponding 73282257610Sandi * lower case letter in UTF-8 (it does so by flipping $UTF8_LOWER_TO_UPPER) 73382257610Sandi * 73482257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 73582257610Sandi */ 73682257610Sandi$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER); 73782257610Sandi 73882257610Sandi/**
73982257610Sandi * UTF-8 lookup table for lower case accented letters 74082257610Sandi * 74182257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 74282257610Sandi * range. This are lower case letters only. 74382257610Sandi * 74482257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 74582257610Sandi * @see utf8_deaccent() 74682257610Sandi */ 74782257610Sandi$UTF8_LOWER_ACCENTS = array( 74882257610Sandi 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o', 74982257610Sandi 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k', 75082257610Sandi 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o', 75182257610Sandi 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o', 75282257610Sandi 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c', 75382257610Sandi 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't', 75482257610Sandi 'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l', 75582257610Sandi 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z', 75682257610Sandi 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't', 75782257610Sandi 'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', 75882257610Sandi 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j', 75982257610Sandi 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o', 76082257610Sandi 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g', 76182257610Sandi 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a', 7620c59b0cfSandi 'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 76382257610Sandi); 76482257610Sandi 76582257610Sandi/** 
76682257610Sandi * UTF-8 lookup table for upper case accented letters 76782257610Sandi * 76882257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 76982257610Sandi * range. This are upper case letters only. 77082257610Sandi * 77182257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 77282257610Sandi * @see utf8_deaccent() 77382257610Sandi */ 77482257610Sandi$UTF8_UPPER_ACCENTS = array( 775df3ecd55SAndreas Gohr 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O', 776df3ecd55SAndreas Gohr 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 777df3ecd55SAndreas Gohr 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O', 778df3ecd55SAndreas Gohr 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O', 779df3ecd55SAndreas Gohr 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C', 780df3ecd55SAndreas Gohr 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T', 781df3ecd55SAndreas Gohr 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L', 782df3ecd55SAndreas Gohr 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z', 783df3ecd55SAndreas Gohr 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T', 784df3ecd55SAndreas Gohr 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', 785df3ecd55SAndreas Gohr 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J', 786df3ecd55SAndreas Gohr 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O', 787df3ecd55SAndreas Gohr 'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G', 788df3ecd55SAndreas Gohr 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A', 789df3ecd55SAndreas Gohr 'Û' => 'U', 'Þ' => 'Th', 
'Ð' => 'Dh', 'Æ' => 'Ae', 79082257610Sandi); 79182257610Sandi 792099ada41Sandi/** 793099ada41Sandi * UTF-8 array of common special characters 794099ada41Sandi * 795099ada41Sandi * This array should contain all special characters (not a letter or digit) 796099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum 797099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special 798099ada41Sandi * chars. 799099ada41Sandi * 800099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! 801ad81d431SAndreas Gohr * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a 802099ada41Sandi * 803099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org> 804099ada41Sandi * @see utf8_stripspecials() 805099ada41Sandi */ 806099ada41Sandi$UTF8_SPECIAL_CHARS = array( 807099ada41Sandi 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 808ad81d431SAndreas Gohr 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 8095c812709Sandi 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 8105c812709Sandi 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 811099ada41Sandi 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 812099ada41Sandi 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 813099ada41Sandi 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 814099ada41Sandi 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 815099ada41Sandi 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 816099ada41Sandi 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 817099ada41Sandi 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 818099ada41Sandi 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 
0x0309, 0x0323, 0x0384, 819099ada41Sandi 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 820099ada41Sandi 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 821099ada41Sandi 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 822099ada41Sandi 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 823099ada41Sandi 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 824099ada41Sandi 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 825099ada41Sandi 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 826099ada41Sandi 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 827099ada41Sandi 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 828099ada41Sandi 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 829099ada41Sandi 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 830099ada41Sandi 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 831099ada41Sandi 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 832099ada41Sandi 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 833099ada41Sandi 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 834099ada41Sandi 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 835099ada41Sandi 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 836099ada41Sandi 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 837099ada41Sandi 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 838099ada41Sandi 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 839099ada41Sandi 0x2703, 0x2704, 
0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 840099ada41Sandi 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 841099ada41Sandi 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 842099ada41Sandi 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 843099ada41Sandi 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 844099ada41Sandi 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 845099ada41Sandi 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 846099ada41Sandi 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 847099ada41Sandi 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 848099ada41Sandi 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 849099ada41Sandi 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 850099ada41Sandi 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 851099ada41Sandi 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 852099ada41Sandi 0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 853099ada41Sandi 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 854099ada41Sandi 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 855099ada41Sandi 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 856099ada41Sandi 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 857099ada41Sandi); 858340756e4Sandi 8598a831f2bSAndreas Gohr/** 8608a831f2bSAndreas Gohr * Romanization lookup table 8618a831f2bSAndreas Gohr * 8628a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language 8638a831f2bSAndreas Gohr * different from the ones based upon latin 
letters into plain ASCII. 8648a831f2bSAndreas Gohr * 8658a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works 8668a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement 8678a831f2bSAndreas Gohr * only. Specialities of each language are not supported. 8688a831f2bSAndreas Gohr * 8698a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org> 8708a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com> 8718a831f2bSAndreas Gohr * @link http://www.uconv.com/translit.htm 8728a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi> 8738a831f2bSAndreas Gohr * @link http://kanjidict.stc.cx/hiragana.php?src=2 8748a831f2bSAndreas Gohr * @link http://www.translatum.gr/converter/greek-transliteration.htm 8758a831f2bSAndreas Gohr * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 8768a831f2bSAndreas Gohr * @link http://www.btranslations.com/resources/romanization/korean.asp 8778a831f2bSAndreas Gohr */ 8788a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array( 8798a831f2bSAndreas Gohr //russian cyrillic 8808a831f2bSAndreas Gohr 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 8818a831f2bSAndreas Gohr 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 8828a831f2bSAndreas Gohr 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 8838a831f2bSAndreas Gohr 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 8848a831f2bSAndreas Gohr 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 8858a831f2bSAndreas Gohr 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 8868a831f2bSAndreas Gohr 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'th','Щ'=>'Th','ъ'=>'qh', 8878a831f2bSAndreas Gohr 'Ъ'=>'Qh','ы'=>'y','Ы'=>'Y','ь'=>'q','Ь'=>'Q','э'=>'eh','Э'=>'Eh','ю'=>'ju', 8888a831f2bSAndreas Gohr 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 8898a831f2bSAndreas Gohr 
// Ukrainian cyrillic 8908a831f2bSAndreas Gohr 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 8918a831f2bSAndreas Gohr // Georgian 8928a831f2bSAndreas Gohr 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 8938a831f2bSAndreas Gohr 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 8948a831f2bSAndreas Gohr 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 8958a831f2bSAndreas Gohr 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 8968a831f2bSAndreas Gohr 'ჰ'=>'xh', 8978a831f2bSAndreas Gohr //Sanskrit 8988a831f2bSAndreas Gohr 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 8998a831f2bSAndreas Gohr 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 9008a831f2bSAndreas Gohr 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 9018a831f2bSAndreas Gohr 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 9028a831f2bSAndreas Gohr 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 9038a831f2bSAndreas Gohr 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 9048a831f2bSAndreas Gohr 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 9058a831f2bSAndreas Gohr //Hebrew 906*3dbad6dcSDenis Simakov 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 907*3dbad6dcSDenis Simakov 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 908*3dbad6dcSDenis Simakov 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 9098a831f2bSAndreas Gohr 'ש'=>'sh','ת'=>'t', 9108a831f2bSAndreas Gohr //Arabic 9118a831f2bSAndreas Gohr 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 9128a831f2bSAndreas Gohr 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 9138a831f2bSAndreas Gohr 
'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 9148a831f2bSAndreas Gohr 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 9158a831f2bSAndreas Gohr 9168a831f2bSAndreas Gohr // Japanese hiragana 9178a831f2bSAndreas Gohr 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 9188a831f2bSAndreas Gohr 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 9198a831f2bSAndreas Gohr 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 9208a831f2bSAndreas Gohr 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 9218a831f2bSAndreas Gohr 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 9228a831f2bSAndreas Gohr 'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 9238a831f2bSAndreas Gohr 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 9248a831f2bSAndreas Gohr 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 9258a831f2bSAndreas Gohr 'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 9268a831f2bSAndreas Gohr 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 9278a831f2bSAndreas Gohr 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 9288a831f2bSAndreas Gohr 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 9298a831f2bSAndreas Gohr 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 9308a831f2bSAndreas Gohr 'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 9318a831f2bSAndreas Gohr 'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 9328a831f2bSAndreas Gohr 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 9338a831f2bSAndreas Gohr 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 9348a831f2bSAndreas Gohr 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 9358a831f2bSAndreas Gohr 
'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 9368a831f2bSAndreas Gohr 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 9378a831f2bSAndreas Gohr 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 9388a831f2bSAndreas Gohr 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 9398a831f2bSAndreas Gohr 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 9408a831f2bSAndreas Gohr 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 9418a831f2bSAndreas Gohr 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 9428a831f2bSAndreas Gohr 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 9438a831f2bSAndreas Gohr 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 9448a831f2bSAndreas Gohr 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 9458a831f2bSAndreas Gohr 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 9468a831f2bSAndreas Gohr 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 9478a831f2bSAndreas Gohr 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 9488a831f2bSAndreas Gohr 'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 9498a831f2bSAndreas Gohr 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 9508a831f2bSAndreas Gohr 'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 9518a831f2bSAndreas Gohr 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 9528a831f2bSAndreas Gohr 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 9538a831f2bSAndreas Gohr 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 9548a831f2bSAndreas Gohr 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 9558a831f2bSAndreas Gohr 'じゅ'=>'zyu', 9568a831f2bSAndreas Gohr // 
Japanese katakana 9578a831f2bSAndreas Gohr 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 9588a831f2bSAndreas Gohr 'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 9598a831f2bSAndreas Gohr 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 9608a831f2bSAndreas Gohr 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 9618a831f2bSAndreas Gohr 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 9628a831f2bSAndreas Gohr 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 9638a831f2bSAndreas Gohr 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 9648a831f2bSAndreas Gohr 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 9658a831f2bSAndreas Gohr 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 9668a831f2bSAndreas Gohr 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 9678a831f2bSAndreas Gohr 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 9688a831f2bSAndreas Gohr 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 9698a831f2bSAndreas Gohr 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 9708a831f2bSAndreas Gohr 'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 9718a831f2bSAndreas Gohr 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 9728a831f2bSAndreas Gohr 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 9738a831f2bSAndreas Gohr 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 9748a831f2bSAndreas Gohr 'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 9758a831f2bSAndreas Gohr 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 9768a831f2bSAndreas Gohr 
'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 9778a831f2bSAndreas Gohr 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 9788a831f2bSAndreas Gohr 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 9798a831f2bSAndreas Gohr 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 9808a831f2bSAndreas Gohr 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 9818a831f2bSAndreas Gohr 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 9828a831f2bSAndreas Gohr 'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 9838a831f2bSAndreas Gohr 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 9848a831f2bSAndreas Gohr 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 9858a831f2bSAndreas Gohr 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 9868a831f2bSAndreas Gohr 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 9878a831f2bSAndreas Gohr 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 9888a831f2bSAndreas Gohr 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 9898a831f2bSAndreas Gohr 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 9908a831f2bSAndreas Gohr 'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 9918a831f2bSAndreas Gohr 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 9928a831f2bSAndreas Gohr 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 9938a831f2bSAndreas Gohr 'ジョ'=>'zyo','ジュ'=>'zyu', 9948a831f2bSAndreas Gohr 9958a831f2bSAndreas Gohr // "Greeklish" 9968a831f2bSAndreas Gohr 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 9978a831f2bSAndreas Gohr 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 9988a831f2bSAndreas Gohr 
9998a831f2bSAndreas Gohr // Thai 10008a831f2bSAndreas Gohr 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 10018a831f2bSAndreas Gohr 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 10028a831f2bSAndreas Gohr 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 10038a831f2bSAndreas Gohr 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 10048a831f2bSAndreas Gohr 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 10058a831f2bSAndreas Gohr 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 10068a831f2bSAndreas Gohr 'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 10078a831f2bSAndreas Gohr '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 10088a831f2bSAndreas Gohr 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 10098a831f2bSAndreas Gohr 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 10108a831f2bSAndreas Gohr '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 10118a831f2bSAndreas Gohr 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 10128a831f2bSAndreas Gohr 'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 10138a831f2bSAndreas Gohr 'เ–ียว'=>'iao', 10148a831f2bSAndreas Gohr 10158a831f2bSAndreas Gohr // Korean 10168a831f2bSAndreas Gohr 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 10178a831f2bSAndreas Gohr 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 10188a831f2bSAndreas Gohr 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 10198a831f2bSAndreas Gohr 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 10208a831f2bSAndreas Gohr 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 
10218a831f2bSAndreas Gohr 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 10228a831f2bSAndreas Gohr); 1023340756e4Sandi 1024340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 : 10258a831f2bSAndreas Gohr 1026