<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */


/**
 * Check for mb_string support
 *
 * UTF8_MBSTRING is set to 1 when mb_substr() exists and the constant
 * UTF8_NOMBSTRING has not been defined, otherwise to 0. A value defined
 * before this file is loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}


/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename to encode
 * @param  boolean $safe skip encoding when $file only consists of
 *                       a-z, A-Z, 0-9 and the characters / _ - . %
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    // urlencode() also encodes the slash - turn those back into real slashes
    return str_replace('%2F','/',urlencode($file));
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte of $str is greater than 127
 */
function utf8_isASCII($str){
    // a single byte-wise scan; no /u modifier on purpose, we look at raw bytes
    return (preg_match('/[\x80-\xFF]/',$str) !== 1);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string $str without any byte >= 128
 */
function utf8_strip($str){
    $ascii = '';
    $len   = strlen($str);
    for($i=0; $i<$len; $i++){
        if(ord($str[$i]) < 128){
            $ascii .= $str[$i];
        }
    }
    return $ascii;
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the structure is checked: every lead byte has to be followed by the
 * announced number of continuation bytes (10bbbbbb). Lead bytes announcing
 * up to five continuation bytes are accepted; overlong forms and surrogates
 * are not rejected by this check.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $byte = ord($Str[$i]);
        if ($byte < 0x80) continue;              # 0bbbbbbb
        elseif (($byte & 0xE0) == 0xC0) $n=1;    # 110bbbbb
        elseif (($byte & 0xF0) == 0xE0) $n=2;    # 1110bbbb
        elseif (($byte & 0xF8) == 0xF0) $n=3;    # 11110bbb
        elseif (($byte & 0xFC) == 0xF8) $n=4;    # 111110bb
        elseif (($byte & 0xFE) == 0xFC) $n=5;    # 1111110b
        else return false;                       # Does not match any model
        for ($j=0; $j<$n; $j++) {                # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the
 * supported PHP versions before relying on it long term.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 * Note: supports use of negative offsets and lengths but will be slower
 * when doing so
 *
 * Uses mb_substr() when UTF8_MBSTRING is set, otherwise falls back to
 * regular expressions.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string
 * @param  integer number of UTF-8 characters offset (from left)
 * @param  integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        // the result has to be returned here, otherwise the slow fallback
        // below runs anyway; the encoding is passed explicitly like in the
        // other mb_* calls of this file
        if( $length === null ){
            // the full character count as length means "up to the end"
            return mb_substr($str, $offset, mb_strlen($str,'utf-8'), 'utf-8');
        }else{
            return mb_substr($str, $offset, $length, 'utf-8');
        }
    }

    if ( $offset >= 0 && ($length === null || $length >= 0) ) {
        if ( $length === null ) {
            $length = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                $length = '*';
            } else {
                $length = '{'.$length.'}';
            }
        }

        // skip $offset characters, then capture the requested amount
        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;

    } else {
        // Handle negatives using different, slower technique
        // From: http://www.php.net/manual/en/function.substr.php#44838
        preg_match_all('/./u', $str, $ar);
        if( $length !== null ) {
            return join('',array_slice($ar[0],$offset,$length));
        } else {
            return join('',array_slice($ar[0],$offset));
        }
    }
}


/**
 * Unicode aware replacement for substr_replace()
 *
 * The first $start characters are only kept when $start is greater than 0.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      the original string
 * @param  string  $replacement text to insert
 * @param  integer $start       character offset where the replacement starts
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
    $tail = utf8_substr($string, $start+$length);
    return $head.$replacement.$tail;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter
 * @param  string $str string to split
 * @return mixed array of parts, FALSE (and an E_USER_WARNING) on empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return FALSE;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * The search strings are turned into quoted regular expressions and handed
 * to preg_replace(), so the replacement is subject to preg_replace()'s
 * reference syntax.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   search string or array of search strings
 * @param  mixed  $r   replacement string or array of replacements
 * @param  string $str subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            // the delimiter has to be quoted here as well, otherwise a
            // search string containing "!" breaks the pattern
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }
    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed straight to ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip from the beginning
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass
    $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed straight to rtrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip from the end
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass
    $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed straight to trim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // the charlist has to be handed on, otherwise only whitespace is trimmed
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the global lookup
 * table $UTF8_UPPER_TO_LOWER (not defined in the visible part of this file).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // most code points have no mapping - check before reading the table
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the global lookup
 * table $UTF8_LOWER_TO_UPPER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // most code points have no mapping - check before reading the table
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * The replacements are read from the globals $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS (not defined in the visible part of this file).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string
 * @param  integer $case -1, 0 or 1, see above
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * The translation table is read from the global $UTF8_ROMANIZATION
 * (not defined in the visible part of this file).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS;

    // the quoted list of special chars is built once and kept between calls
    static $specials = null;
    if(is_null($specials)){
        $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
    }

    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset character offset to start searching from
 * @return mixed character position of the first match or FALSE
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        // everything in front of the first separator is the prefix whose
        // character count equals the position we are looking for
        $ar = utf8_explode($needle, $haystack);
        if ( count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }else{
        if ( !is_int($offset) ) {
            trigger_error('Offset must be an integer',E_USER_WARNING);
            return false;
        }

        $haystack = utf8_substr($haystack, $offset);

        if ( false !== ($pos = utf8_strpos($haystack,$needle))){
            return $pos + $offset;
        }
        return false;
    }
}

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (lead byte 110x xxxx) are converted to decimal
 * numeric entities, all other bytes are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
    $ret  = '';
    $max  = strlen($str);
    $last = 0;  // index of the first byte that was not yet copied to $ret
    for ($i=0; $i<$max; $i++) {
        $c1 = ord($str[$i]);
        // 110x xxxx, 110 prefix for 2 bytes unicode - a lead byte at the very
        // end of the string has no second byte and is copied as is
        if ($c1>>5 == 6 && ($i+1) < $max) {
            $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
            $c2   = ord($str[++$i]);               // the next byte
            // 5 payload bits of the lead byte, 6 payload bits of the second byte
            $ret .= '&#' . ((($c1 & 31) << 6) | ($c2 & 63)) . ';';
            $last = $i+1;
        }
    }
    return $ret . substr($str, $last); // append the last batch of regular characters
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        ' sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // a string that ends in the middle of a multi-octet sequence is not valid
    // UTF-8 either, strict mode has to reject it like any other bad input
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                ' sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );
        return FALSE;
    }

    return $out;
}

/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING. Without
 * $strict such values are silently skipped.
 *
 * The string is built by plain concatenation, so an early return on an
 * invalid value leaves no output buffer open.
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        # ASCII range (including control chars)
        if ( ($cp >= 0) && ($cp <= 0x007f) ) {

            $result .= chr($cp);

        # negative or beyond the last Unicode code point
        } else if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return FALSE;
            }

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}

/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

    $uni = utf8_to_unicode($str);
    foreach($uni as $cp){
        // 16 bit big endian - code points above 0xFFFF do not fit
        $out .= pack('n',$cp);
    }
    return $out;
}

/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
    // every 16 bit big endian unit is taken as one code point
    $uni = unpack('n*',$str);
    return unicode_to_utf8($uni);
}

/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string to search
 * @param  string to replace bad bytes with (defaults to '', which removes them) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $result = '';
    // consume the string from the left, one valid character or one bad byte
    // per round; the second group is only set when the last alternative hit
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        if ( !isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $str = substr($str,strlen($matches[0]));
    }
    return $result;
}

/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = up (current character)
 *                           true = down (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

    if ($i <= 0) return 0;

    $limit = strlen($str);
    if ($i>=$limit) return $limit;

    // continuation bytes look like 10bbbbbb - step over them in the wanted direction
    if ($next) {
        while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
    } else {
        while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
    }

    return $i;
}

// only needed if no mb_string available
if(!UTF8_MBSTRING){

    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps the code points of lower case letters to the
     * code points of their corresponding upper case letters
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_LOWER_TO_UPPER;
    $UTF8_LOWER_TO_UPPER = array(
        0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
        0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
        0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
        0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
        0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
        0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
        0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
        0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
        0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
        0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
        0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164, 81982257610Sandi 0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156, 82082257610Sandi 0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118, 82182257610Sandi 0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128, 82282257610Sandi 0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428, 82382257610Sandi 0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055, 82482257610Sandi 0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A, 82582257610Sandi 0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC, 82682257610Sandi 0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0, 82782257610Sandi 0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D, 82882257610Sandi 0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0, 82982257610Sandi 0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5, 83082257610Sandi 0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA, 83182257610Sandi 0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045, 83282257610Sandi 0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F, 83382257610Sandi 0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048, 83482257610Sandi 0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6, 83582257610Sandi 0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407, 83682257610Sandi 0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395, 83782257610Sandi 0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396, 83882257610Sandi 0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 
0x0175=>0x0174, 0x0071=>0x0051, 83982257610Sandi 0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408, 84082257610Sandi 0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F, 84182257610Sandi 0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126, 84282257610Sandi 0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C, 84382257610Sandi 0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E, 84482257610Sandi 0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB, 84582257610Sandi 0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421, 84682257610Sandi 0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A, 84782257610Sandi 0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102, 84882257610Sandi 0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9, 84982257610Sandi 0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122, 85082257610Sandi ); 85182257610Sandi 85282257610Sandi /** 85382257610Sandi * UTF-8 Case lookup table 85482257610Sandi * 85582257610Sandi * This lookuptable defines the lower case letters to their correspponding 85682257610Sandi * upper case letter in UTF-8 (it does so by flipping $UTF8_LOWER_TO_UPPER) 85782257610Sandi * 85882257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 85982257610Sandi */ 86054662a04SAndreas Gohr global $UTF8_UPPER_TO_LOWER; 86182257610Sandi $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER); 86282257610Sandi 863ab77016bSAndreas Gohr} // end of case lookup tables 864ab77016bSAndreas Gohr 865ab77016bSAndreas Gohr 86682257610Sandi/** 86782257610Sandi * UTF-8 lookup table for lower case accented letters 86882257610Sandi * 86982257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 87082257610Sandi * range. This are lower case letters only. 
87182257610Sandi * 87282257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 87382257610Sandi * @see utf8_deaccent() 87482257610Sandi */ 87554662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS; 87682257610Sandi$UTF8_LOWER_ACCENTS = array( 87782257610Sandi 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o', 87882257610Sandi 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k', 87982257610Sandi 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o', 88082257610Sandi 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o', 88182257610Sandi 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c', 88282257610Sandi 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't', 88382257610Sandi 'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l', 88482257610Sandi 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z', 88582257610Sandi 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't', 88682257610Sandi 'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', 88782257610Sandi 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j', 88882257610Sandi 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o', 88982257610Sandi 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g', 89082257610Sandi 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a', 89174c0c504Schris 'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e', 89282257610Sandi); 89382257610Sandi 89482257610Sandi/** 89582257610Sandi * UTF-8 lookup table for upper case accented letters 89682257610Sandi * 89782257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 
89882257610Sandi * range. This are upper case letters only. 89982257610Sandi * 90082257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 90182257610Sandi * @see utf8_deaccent() 90282257610Sandi */ 90354662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS; 90482257610Sandi$UTF8_UPPER_ACCENTS = array( 905df3ecd55SAndreas Gohr 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O', 906df3ecd55SAndreas Gohr 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 907df3ecd55SAndreas Gohr 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O', 908df3ecd55SAndreas Gohr 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O', 909df3ecd55SAndreas Gohr 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C', 910df3ecd55SAndreas Gohr 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T', 911df3ecd55SAndreas Gohr 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L', 912df3ecd55SAndreas Gohr 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z', 913df3ecd55SAndreas Gohr 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T', 914df3ecd55SAndreas Gohr 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', 915df3ecd55SAndreas Gohr 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J', 916df3ecd55SAndreas Gohr 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O', 917df3ecd55SAndreas Gohr 'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G', 918df3ecd55SAndreas Gohr 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A', 91974c0c504Schris 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E', 92082257610Sandi); 92182257610Sandi 922099ada41Sandi/** 923099ada41Sandi * UTF-8 array of common 
special characters 924099ada41Sandi * 925099ada41Sandi * This array should contain all special characters (not a letter or digit) 926099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum 927099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special 928099ada41Sandi * chars. 929099ada41Sandi * 930099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! 931ad81d431SAndreas Gohr * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a 932099ada41Sandi * 933099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org> 934099ada41Sandi * @see utf8_stripspecials() 935099ada41Sandi */ 93654662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS; 937099ada41Sandi$UTF8_SPECIAL_CHARS = array( 938099ada41Sandi 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 939ad81d431SAndreas Gohr 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 9405c812709Sandi 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 9415c812709Sandi 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 942099ada41Sandi 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 943099ada41Sandi 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 944099ada41Sandi 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 945099ada41Sandi 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 946099ada41Sandi 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 947099ada41Sandi 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 948099ada41Sandi 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 949099ada41Sandi 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 950099ada41Sandi 0x0385, 0x0387, 0x03b2, 0x03c6, 
0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 951099ada41Sandi 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 952099ada41Sandi 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 953099ada41Sandi 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 954099ada41Sandi 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 955099ada41Sandi 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 956099ada41Sandi 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 957099ada41Sandi 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 958099ada41Sandi 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 959099ada41Sandi 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 960099ada41Sandi 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 961099ada41Sandi 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 962099ada41Sandi 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 963099ada41Sandi 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 964099ada41Sandi 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 965099ada41Sandi 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 966099ada41Sandi 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 967099ada41Sandi 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 968099ada41Sandi 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 969099ada41Sandi 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 970099ada41Sandi 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 
971099ada41Sandi 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 972099ada41Sandi 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 973099ada41Sandi 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 974099ada41Sandi 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 975099ada41Sandi 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 976099ada41Sandi 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 977099ada41Sandi 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 978099ada41Sandi 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 979099ada41Sandi 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 980099ada41Sandi 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 981099ada41Sandi 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 982099ada41Sandi 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 983099ada41Sandi 0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 984099ada41Sandi 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 985099ada41Sandi 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 986099ada41Sandi 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 987099ada41Sandi 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 988099ada41Sandi); 989340756e4Sandi 9908a831f2bSAndreas Gohr/** 9918a831f2bSAndreas Gohr * Romanization lookup table 9928a831f2bSAndreas Gohr * 9938a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language 9948a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII. 
9958a831f2bSAndreas Gohr * 9968a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works 9978a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement 9988a831f2bSAndreas Gohr * only. Specialities of each language are not supported. 9998a831f2bSAndreas Gohr * 10008a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org> 10018a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com> 10028a831f2bSAndreas Gohr * @link http://www.uconv.com/translit.htm 10038a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi> 10048a831f2bSAndreas Gohr * @link http://kanjidict.stc.cx/hiragana.php?src=2 10058a831f2bSAndreas Gohr * @link http://www.translatum.gr/converter/greek-transliteration.htm 10068a831f2bSAndreas Gohr * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 10078a831f2bSAndreas Gohr * @link http://www.btranslations.com/resources/romanization/korean.asp 10088a831f2bSAndreas Gohr */ 100954662a04SAndreas Gohrglobal $UTF8_ROMANIZATION; 10108a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array( 10118a831f2bSAndreas Gohr //russian cyrillic 10128a831f2bSAndreas Gohr 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 10138a831f2bSAndreas Gohr 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 10148a831f2bSAndreas Gohr 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 10158a831f2bSAndreas Gohr 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 10168a831f2bSAndreas Gohr 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 10178a831f2bSAndreas Gohr 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1018d8cb2602SDenis Simakov 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1019d8cb2602SDenis Simakov 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 10208a831f2bSAndreas Gohr 
'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 10218a831f2bSAndreas Gohr // Ukrainian cyrillic 10228a831f2bSAndreas Gohr 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 10238a831f2bSAndreas Gohr // Georgian 10248a831f2bSAndreas Gohr 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 10258a831f2bSAndreas Gohr 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 10268a831f2bSAndreas Gohr 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 10278a831f2bSAndreas Gohr 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 10288a831f2bSAndreas Gohr 'ჰ'=>'xh', 10298a831f2bSAndreas Gohr //Sanskrit 10308a831f2bSAndreas Gohr 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 10318a831f2bSAndreas Gohr 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 10328a831f2bSAndreas Gohr 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 10338a831f2bSAndreas Gohr 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 10348a831f2bSAndreas Gohr 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 10358a831f2bSAndreas Gohr 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 10368a831f2bSAndreas Gohr 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 10378a831f2bSAndreas Gohr //Hebrew 10383dbad6dcSDenis Simakov 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 10393dbad6dcSDenis Simakov 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 10403dbad6dcSDenis Simakov 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 10418a831f2bSAndreas Gohr 'ש'=>'sh','ת'=>'t', 10428a831f2bSAndreas Gohr //Arabic 10438a831f2bSAndreas Gohr 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 10448a831f2bSAndreas Gohr 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 
10458a831f2bSAndreas Gohr 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 10468a831f2bSAndreas Gohr 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 10478a831f2bSAndreas Gohr 10488a831f2bSAndreas Gohr // Japanese hiragana 10498a831f2bSAndreas Gohr 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 10508a831f2bSAndreas Gohr 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 10518a831f2bSAndreas Gohr 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 10528a831f2bSAndreas Gohr 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 10538a831f2bSAndreas Gohr 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 10548a831f2bSAndreas Gohr 'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 10558a831f2bSAndreas Gohr 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 10568a831f2bSAndreas Gohr 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 10578a831f2bSAndreas Gohr 'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 10588a831f2bSAndreas Gohr 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 10598a831f2bSAndreas Gohr 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 10608a831f2bSAndreas Gohr 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 10618a831f2bSAndreas Gohr 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 10628a831f2bSAndreas Gohr 'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 10638a831f2bSAndreas Gohr 'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 10648a831f2bSAndreas Gohr 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 10658a831f2bSAndreas Gohr 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 10668a831f2bSAndreas Gohr 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 10678a831f2bSAndreas Gohr 
'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 10688a831f2bSAndreas Gohr 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 10698a831f2bSAndreas Gohr 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 10708a831f2bSAndreas Gohr 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 10718a831f2bSAndreas Gohr 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 10728a831f2bSAndreas Gohr 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 10738a831f2bSAndreas Gohr 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 10748a831f2bSAndreas Gohr 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 10758a831f2bSAndreas Gohr 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 10768a831f2bSAndreas Gohr 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 10778a831f2bSAndreas Gohr 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 10788a831f2bSAndreas Gohr 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 10798a831f2bSAndreas Gohr 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 10808a831f2bSAndreas Gohr 'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 10818a831f2bSAndreas Gohr 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 10828a831f2bSAndreas Gohr 'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 10838a831f2bSAndreas Gohr 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 10848a831f2bSAndreas Gohr 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 10858a831f2bSAndreas Gohr 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 10868a831f2bSAndreas Gohr 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 10878a831f2bSAndreas Gohr 'じゅ'=>'zyu', 
10888a831f2bSAndreas Gohr // Japanese katakana 10898a831f2bSAndreas Gohr 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 10908a831f2bSAndreas Gohr 'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 10918a831f2bSAndreas Gohr 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 10928a831f2bSAndreas Gohr 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 10938a831f2bSAndreas Gohr 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 10948a831f2bSAndreas Gohr 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 10958a831f2bSAndreas Gohr 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 10968a831f2bSAndreas Gohr 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 10978a831f2bSAndreas Gohr 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 10988a831f2bSAndreas Gohr 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 10998a831f2bSAndreas Gohr 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 11008a831f2bSAndreas Gohr 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 11018a831f2bSAndreas Gohr 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 11028a831f2bSAndreas Gohr 'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 11038a831f2bSAndreas Gohr 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 11048a831f2bSAndreas Gohr 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 11058a831f2bSAndreas Gohr 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 11068a831f2bSAndreas Gohr 'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 11078a831f2bSAndreas Gohr 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 
11088a831f2bSAndreas Gohr 'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 11098a831f2bSAndreas Gohr 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 11108a831f2bSAndreas Gohr 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 11118a831f2bSAndreas Gohr 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 11128a831f2bSAndreas Gohr 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 11138a831f2bSAndreas Gohr 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 11148a831f2bSAndreas Gohr 'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 11158a831f2bSAndreas Gohr 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 11168a831f2bSAndreas Gohr 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 11178a831f2bSAndreas Gohr 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 11188a831f2bSAndreas Gohr 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 11198a831f2bSAndreas Gohr 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 11208a831f2bSAndreas Gohr 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 11218a831f2bSAndreas Gohr 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 11228a831f2bSAndreas Gohr 'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 11238a831f2bSAndreas Gohr 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 11248a831f2bSAndreas Gohr 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 11258a831f2bSAndreas Gohr 'ジョ'=>'zyo','ジュ'=>'zyu', 11268a831f2bSAndreas Gohr 11278a831f2bSAndreas Gohr // "Greeklish" 11288a831f2bSAndreas Gohr 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 11298a831f2bSAndreas Gohr 
'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 11308a831f2bSAndreas Gohr 11318a831f2bSAndreas Gohr // Thai 11328a831f2bSAndreas Gohr 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 11338a831f2bSAndreas Gohr 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 11348a831f2bSAndreas Gohr 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 11358a831f2bSAndreas Gohr 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 11368a831f2bSAndreas Gohr 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 11378a831f2bSAndreas Gohr 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 11388a831f2bSAndreas Gohr 'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 11398a831f2bSAndreas Gohr '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 11408a831f2bSAndreas Gohr 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 11418a831f2bSAndreas Gohr 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 11428a831f2bSAndreas Gohr '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 11438a831f2bSAndreas Gohr 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 11448a831f2bSAndreas Gohr 'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 11458a831f2bSAndreas Gohr 'เ–ียว'=>'iao', 11468a831f2bSAndreas Gohr 11478a831f2bSAndreas Gohr // Korean 11488a831f2bSAndreas Gohr 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 11498a831f2bSAndreas Gohr 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 11508a831f2bSAndreas Gohr 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 11518a831f2bSAndreas Gohr 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 
11528a831f2bSAndreas Gohr 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 11538a831f2bSAndreas Gohr 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 11548a831f2bSAndreas Gohr); 1155340756e4Sandi 1156340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 : 11578a831f2bSAndreas Gohr 1158