<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * URL-encode a filename to allow unicode characters
 *
 * Slashes are not encoded.
 *
 * When the second parameter is true the string is only encoded if it
 * contains characters outside of [a-zA-Z0-9/_\-.%]. This makes it safe
 * to run the function multiple times on the same string (default is true).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename to encode
 * @param  boolean $safe skip encoding when the name already looks encoded
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * URL-decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file
 * @return string
 */
function utf8_decodeFN($file){
    $file = urldecode($file);
    return $file;
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte has the high bit set
 */
function utf8_isASCII($str){
    // byte-wise match (no /u modifier): any byte >= 0x80 is non-ASCII
    return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
    $ascii = '';
    $len   = strlen($str);
    for($i=0; $i<$len; $i++){
        // bracket offsets: the {} offset syntax is gone in current PHP
        if(ord($str[$i]) < 128){
            $ascii .= $str[$i];
        }
    }
    return $ascii;
}

/**
 * Tries to detect if a string is in Unicode (UTF-8) encoding
 *
 * Only the byte structure is checked (lead byte followed by the right
 * number of 10bbbbbb continuation bytes); overlong forms and 5/6 byte
 * sequences are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $byte = ord($Str[$i]);
        if ($byte < 0x80) continue;               # 0bbbbbbb
        elseif (($byte & 0xE0) == 0xC0) $n=1;     # 110bbbbb
        elseif (($byte & 0xF0) == 0xE0) $n=2;     # 1110bbbb
        elseif (($byte & 0xF8) == 0xF0) $n=3;     # 11110bbb
        elseif (($byte & 0xFC) == 0xF8) $n=4;     # 111110bb
        elseif (($byte & 0xFE) == 0xFC) $n=5;     # 1111110b
        else return false;                        # Does not match any model
        for ($j=0; $j<$n; $j++) {                 # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * Unicode aware replacement for substr()
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str
 * @param  int      $start  character offset
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string
 */
function utf8_substr($str,$start,$length=null){
    // the s modifier makes . match newlines too, otherwise they would be
    // silently dropped from the result
    if(!preg_match_all('/./us', $str, $ar)){
        // empty input or invalid UTF-8: nothing to slice
        return '';
    }

    // strict comparison: a length of 0 must yield '' like substr() does
    if($length !== null) {
        return join("",array_slice($ar[0],$start,$length));
    } else {
        return join("",array_slice($ar[0],$start));
    }
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  character offset where the replacement starts
 * @param  int    $length number of characters to replace (0 inserts)
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter (taken literally)
 * @param  string $str
 * @return array|false false (plus an E_USER_WARNING) on empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return FALSE;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement are both taken literally, as str_replace() does.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   search string(s)
 * @param  string|array $r   replacement string(s)
 * @param  string|array $str subject(s)
 * @return string|array
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            // the delimiter has to be quoted here as well, otherwise a
            // '!' in the search string breaks the pattern
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }

    // escape \ and $ so the replacement is not parsed for backreferences
    if(is_array($r)){
        foreach ($r as $k => $v) {
            $r[$k] = addcslashes($v,'\\$');
        }
    }else{
        $r = addcslashes($r,'\\$');
    }

    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip, '' for default whitespace
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass (^ would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip, '' for default whitespace
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass (^ would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip, '' for default whitespace
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // hand the charlist on - without it both helpers fall back to
    // stripping plain whitespace
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
        return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset avoids a notice for every code point missing in the table
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    // check for the function that is actually called below
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
        return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset avoids a notice for every code point missing in the table
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters.
Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * Characters are replaced using the $UTF8_ROMANIZATION lookup table;
 * pure ASCII input is returned untouched.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS;

    // the quoted character class is built once and reused on later calls
    static $specials = null;
    if(is_null($specials)){
        $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
    }

    // $additional goes into the class unquoted - callers have to pass
    // something that is valid inside [...]
    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset character offset to start searching from
 * @return int|false character position of the first match or false
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
        return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        // everything in front of the first separator is the prefix whose
        // character length equals the position of the needle
        $ar = utf8_explode($needle, $haystack);
        if ( is_array($ar) && count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }else{
        if ( !is_int($offset) ) {
            trigger_error('Offset must be an integer',E_USER_WARNING);
            return false;
        }

        // search the remainder and shift the result back by the offset
        $haystack = utf8_substr($haystack, $offset);

        if ( false !== ($pos = utf8_strpos($haystack,$needle))){
            return $pos + $offset;
        }
        return false;
    }
}

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (U+0080 to U+07FF) are converted to numeric
 * entities; all other bytes are passed through unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
    $ret  = '';
    $max  = strlen($str);
    $last = 0;  // keeps the index of the last regular character
    for ($i=0; $i<$max; $i++) {
        $c1 = ord($str[$i]);
        // 110x xxxx, 110 prefix for 2 bytes unicode; a lead byte at the very
        // end of the string has no trailing byte and is left alone
        if ($c1>>5 == 6 && $i+1 < $max) {
            $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
            $c2 = ord($str[++$i]);                 // the next byte
            // 5 payload bits of the lead byte + 6 payload bits of the trail byte
            $ret .= '&#' . ((($c1 & 31) << 6) | ($c2 & 63)) . ';';
            $last = $i+1;
        }
    }
    return $ret . substr($str, $last); // append the last batch of regular characters
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
*
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * In non-strict mode invalid lead bytes are skipped and illegal
 * sequences (overlong forms, surrogates, out of range values) are still
 * written to the output array.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // bracket offset: the {} string offset syntax is gone in current PHP
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '  sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}

/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to use
 * output buffering to concatenate the UTF-8 string (faster) as well as
 * reference the array by its keys
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
* @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    // the bytes are echoed into an output buffer and collected at the end;
    // every exit path below has to close this buffer again
    ob_start();

    foreach (array_keys($arr) as $k) {

        # ASCII range (including control chars)
        if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {

            echo chr($arr[$k]);

        # 2 byte sequence
        } else if ($arr[$k] <= 0x07ff) {

            echo chr(0xc0 | ($arr[$k] >> 6));
            echo chr(0x80 | ($arr[$k] & 0x003f));

        # Byte order mark (skip)
        } else if($arr[$k] == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {

            // found a surrogate - silently dropped unless in strict mode
            if($strict){
                // throw away the partial result, otherwise the buffer stays
                // open and its content leaks into the page output
                ob_end_clean();
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$arr[$k],
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($arr[$k] <= 0xffff) {

            echo chr(0xe0 | ($arr[$k] >> 12));
            echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
            echo chr(0x80 | ($arr[$k] & 0x003f));

        # 4 byte sequence
        } else if ($arr[$k] <= 0x10ffff) {

            echo chr(0xf0 | ($arr[$k] >> 18));
            echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
            echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
            echo chr(0x80 | ($arr[$k] & 0x3f));

        } elseif($strict) {

            // out of range - close the buffer before bailing out (see above)
            ob_end_clean();
            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$arr[$k],
                E_USER_WARNING
                );

            return FALSE;
        }
    }

    // fetch the collected bytes and close the buffer in one go
    $result = ob_get_clean();
    return $result;
}

/**
 * UTF-8 to UTF-16BE conversion.
*
 * Uses mb_string if available. The fallback encodes code points above
 * U+FFFF as surrogate pairs, so both paths produce UTF-16 rather than
 * truncated UCS-2.
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding'))
        return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

    $uni = utf8_to_unicode($str);
    foreach($uni as $cp){
        if($cp > 0xFFFF){
            // pack('n') only keeps 16 bits - split astral code points into
            // a high and a low surrogate
            $cp  -= 0x10000;
            $out .= pack('n',0xD800 | ($cp >> 10));
            $out .= pack('n',0xDC00 | ($cp & 0x3FF));
        }else{
            $out .= pack('n',$cp);
        }
    }
    return $out;
}

/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Surrogate pairs are combined into a single code point; unpaired
 * surrogates are handed to unicode_to_utf8() as they are.
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
    $units = unpack('n*',$str);
    if(!is_array($units)) return '';

    $units = array_values($units); // unpack() numbers its result from 1
    $cnt   = count($units);
    $uni   = array();
    for($i=0; $i < $cnt; $i++){
        $cp = $units[$i];
        // high surrogate followed by a low surrogate: one astral code point
        if($cp >= 0xD800 && $cp <= 0xDBFF && $i+1 < $cnt &&
           $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
            $cp = 0x10000 + (($cp & 0x3FF) << 10) + ($units[$i+1] & 0x3FF);
            $i++;
        }
        $uni[] = $cp;
    }
    return unicode_to_utf8($uni);
}

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of lower case letters to the code
 * points of their corresponding upper case letters. Some keys appear more
 * than once, always with the same value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
static $UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of upper case letters to the code
 * points of their corresponding lower case letters (it does so by flipping
 * $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);

/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are lower case letters only.
*
 * Keys are single UTF-8 characters, values are their ASCII replacements.
 * A replacement may be longer than one character (e.g. 'ß' => 'ss',
 * 'ä' => 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are upper case letters only.
*
 * Keys are single UTF-8 characters, values are their ASCII replacements.
 * A replacement may be longer than one character (e.g. 'Ä' => 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, .
0x2e, - 0x2d, * 0x2a 802 * 803 * @author Andreas Gohr <andi@splitbrain.org> 804 * @see utf8_stripspecials() 805 */ 806$UTF8_SPECIAL_CHARS = array( 807 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 808 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 809 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 810 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 811 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 812 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 813 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 814 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 815 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 816 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 817 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 818 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 819 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 820 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 821 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 822 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 823 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 824 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 825 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 826 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 827 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 828 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 829 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 
0x2220, 0x2227, 0x2228, 0x2229, 830 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 831 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 832 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 833 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 834 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 835 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 836 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 837 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 838 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 839 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 840 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 841 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 842 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 843 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 844 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 845 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 846 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 847 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 848 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 849 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 850 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 851 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 852 0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 853 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 
0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);

/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * that is not based on Latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Each key is a string of one or more non-Latin characters; each value is the
 * ASCII string that replaces it.
 *
 * Several keys are listed more than once with different values (for example
 * 'ふ' as 'fu' and 'hu', 'し' as 'ci', 'si' and 'shi', 'รร' as 'a' and 'an').
 * PHP keeps only the last value given for a repeated key in an array literal,
 * so the earlier alternatives have no effect on the resulting table.
 *
 * NOTE(review): a few mappings look questionable and should be verified
 * against the linked sources before anyone relies on them:
 *  - Georgian: 'კ' and 'პ' both map to 'p'.
 *  - Greek: capital and small delta map to 'E' and 'e'.
 *  - Hebrew: there is no entry for alef, and bet, gimel, dalet and he map to
 *    'a', 'b', 'g' and 'd', which looks shifted by one letter.
 * Changing any of them alters the romanized output for existing text, so they
 * are left as they are here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
$UTF8_ROMANIZATION = array(
  // Russian Cyrillic (lower- and upper-case pairs)
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'th','Щ'=>'Th','ъ'=>'qh',
  'Ъ'=>'Qh','ы'=>'y','Ы'=>'Y','ь'=>'q','Ь'=>'Q','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian Cyrillic (letters not covered by the Russian list above)
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // NOTE(review): 'კ' and 'პ' both map to 'p' - confirm this is intended
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  // Sanskrit (Devanagari); includes two-character keys for vowel + sign
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  // Hebrew (final letter forms map to the same value as the regular forms)
  // NOTE(review): no entry for alef, and the first letters look shifted by one - verify
  'ב'=>'a','ג'=>'b','ד'=>'g','ה'=>'d','ו'=>'x','ז'=>'v','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'k','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'p','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  // Arabic (some values end in an escaped apostrophe, e.g. 's\'')
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  // Many keys in this section are repeated with alternative spellings; only
  // the last value listed for a key ends up in the table.
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  // Mirrors the hiragana section, including its repeated keys (last value wins).
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish" (only a subset of Greek letters is listed)
  // NOTE(review): delta maps to 'E'/'e' - confirm against the linked converter
  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  // Some vowel keys are written with a leading '–' character followed by the
  // vowel sign. 'รร' and 'ว' each appear twice; the later value is the one kept.
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean (individual jamo letters: consonants first, then vowels)
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);

//Setup VIM: ex: et ts=2 enc=utf-8 :
