<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are not encoded.
 *
 * When $safe is true the string is returned untouched if it consists only of
 * the characters a-z, A-Z, 0-9, '/', '_', '-', '.' and '%'. This makes it safe
 * to run the function several times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file name to encode
 * @param  boolean $safe skip encoding for names that already look encoded/plain (default true)
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    // encode everything, then restore the path separators
    return str_replace('%2F','/',urlencode($file));
}

/**
 * URL-decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte has its high bit set
 */
function utf8_isASCII($str){
    // byte-wise match (no /u): any byte >= 0x80 means "not pure ASCII"
    return !preg_match('/[\x80-\xFF]/', $str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string the input with every byte >= 0x80 removed
 */
function utf8_strip($str){
    return preg_replace('/[\x80-\xFF]+/', '', $str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the byte structure is checked (lead byte followed by the announced
 * number of 10xxxxxx continuation bytes); the historical 5 and 6 byte forms
 * are accepted, overlong forms and surrogates are not rejected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue;                  # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1;        # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2;        # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3;        # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4;        # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5;        # 1111110b
        else return false;                        # Does not match any model
        for ($j=0; $j<$n; $j++) {                 # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * Uses mb_string extension if available. The fallback counts every byte
 * that is not a 10xxxxxx continuation byte, which for well-formed UTF-8 is
 * exactly the number of characters (utf8_decode(), used here previously, is
 * deprecated as of PHP 8.2).
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen'))
        return mb_strlen($string,'UTF-8');

    return strlen(preg_replace('/[\x80-\xBF]+/', '', $string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * Uses mb_string extension if available. The fallback supports negative
 * offsets and lengths but will be slower when doing so.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_substr')){
        if( $length === null ){
            // an explicit length is passed because older PHP versions treat
            // a NULL length as 0 once the encoding argument is given
            return mb_substr($str, $offset, mb_strlen($str,'UTF-8'), 'UTF-8');
        }
        return mb_substr($str, $offset, $length, 'UTF-8');
    }

    if ( $offset >= 0 && $length >= 0 ) {
        // casts keep anything but digits out of the generated pattern
        $offset = (int) $offset;

        if ( $length === null ) {
            $quant = '*';
        } else {
            $strlen = utf8_strlen($str);
            if ( $offset > $strlen ) {
                return '';
            }
            // asking for more than is left means "up to the end"
            $quant = ( ($offset + $length) > $strlen ) ? '*' : '{'.(int) $length.'}';
        }

        if ( preg_match('/^.{'.$offset.'}(.'.$quant.')/us', $str, $matches) ) {
            return $matches[1];
        }
        return false;
    }

    // Handle negatives using different, slower technique
    // From: http://www.php.net/manual/en/function.substr.php#44838
    // (the s modifier is required, otherwise newlines silently vanish)
    preg_match_all('/./us', $str, $ar);
    if( $length !== null ) {
        return join('',array_slice($ar[0],$offset,$length));
    }
    return join('',array_slice($ar[0],$offset));
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * Note: unlike substr_replace() the default $length is 0, i.e. the
 * replacement is inserted at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string
 * @param  string  $replacement
 * @param  integer $start  character offset where the replacement begins
 * @param  integer $length number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator, must not be empty
 * @param  string $str
 * @return mixed array of pieces, FALSE (plus an E_USER_WARNING) on empty separator
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return FALSE;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement strings are taken literally: needles are quoted
 * for the regular expression and '$' / '\' in the replacement are escaped so
 * they cannot act as backreferences.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of replacement strings
 * @param  string $str subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            // the delimiter has to be quoted here as well
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }

    if(!is_array($r)){
        $r = addcslashes($r,'\\$');
    }else{
        foreach ($r as $k => $v) {
            $r[$k] = addcslashes($v,'\\$');
        }
    }
    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip; all are taken literally (no ranges)
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    // quote charlist for use in a character class (covers \ - ] [ ^ and /)
    $charlist = preg_quote($charlist,'/');

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip; all are taken literally (no ranges)
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    // quote charlist for use in a character class (covers \ - ] [ ^ and /)
    $charlist = preg_quote($charlist,'/');

    // D: let $ match at the very end only, not before a trailing newline
    return preg_replace('/['.$charlist.']+$/uD','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip; all are taken literally (no ranges)
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the global
 * $UTF8_UPPER_TO_LOWER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
        return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // most code points have no mapping - do not touch missing keys
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the global
 * $UTF8_LOWER_TO_UPPER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
        return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // most code points have no mapping - do not touch missing keys
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string
 * @param  integer $case -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * Replaces characters according to the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (inserted verbatim into
 *                            a regexp char class, so it must already be quoted)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS;

    // the quoted character class is built once per request
    static $specials = null;
    if($specials === null){
        $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
    }

    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset character offset to start searching from
 * @return mixed character position of the first match or FALSE
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
        return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        // everything before the first separator is the prefix we have to count
        $ar = utf8_explode($needle, $haystack);
        if ( is_array($ar) && count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }

    if ( !is_int($offset) ) {
        trigger_error('Offset must be an integer',E_USER_WARNING);
        return false;
    }

    // search the remainder and shift the result back by the skipped characters
    $rest = utf8_substr($haystack, $offset);

    if ( false !== ($pos = utf8_strpos($rest,$needle))){
        return $pos + $offset;
    }
    return false;
}

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied unchanged; every well-formed 2, 3 or 4 byte
 * sequence is replaced by a decimal numeric character reference. Bytes that
 * do not form a well-formed sequence are passed through untouched.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
    $ret  = '';
    $max  = strlen($str);
    $last = 0; // index of the first byte not yet copied to $ret
    for ($i=0; $i<$max; $i++) {
        $lead = ord($str[$i]);
        if ($lead < 0x80) continue;               // plain ASCII

        if (($lead & 0xE0) == 0xC0) {             // 110x xxxx: 2 byte sequence
            $extra = 1; $cp = $lead & 0x1F;
        } elseif (($lead & 0xF0) == 0xE0) {       // 1110 xxxx: 3 byte sequence
            $extra = 2; $cp = $lead & 0x0F;
        } elseif (($lead & 0xF8) == 0xF0) {       // 1111 0xxx: 4 byte sequence
            $extra = 3; $cp = $lead & 0x07;
        } else {
            continue;                             // stray continuation / invalid lead byte
        }

        if ($i + $extra >= $max) break;           // truncated at end of string

        // collect 6 payload bits from each continuation byte
        $valid = true;
        for ($j=1; $j<=$extra; $j++) {
            $b = ord($str[$i+$j]);
            if (($b & 0xC0) != 0x80) { $valid = false; break; }
            $cp = ($cp << 6) | ($b & 0x3F);
        }
        if (!$valid) continue;

        // append all the regular characters we've passed, then the entity
        $ret .= substr($str, $last, $i-$last) . '&#' . $cp . ';';
        $i   += $extra;
        $last = $i+1;
    }
    return $ret . substr($str, $last); // append the last batch of regular characters
}
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. Without $strict, bytes that cannot start or continue
 * a sequence are skipped and illegal code points are passed through.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // the string ended while continuation octets were still expected
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                '   sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return FALSE;
    }

    return $out;
}
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING. Without
 * $strict such values are silently skipped.
 *
 * The string is built by plain concatenation; no output buffer is opened, so
 * an early return cannot leave a buffer behind.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $out = '';

    foreach ($arr as $k => $cp) {

        # Outside of the Unicode range (negative values included)
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $out .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $out .= chr(0xc0 | ($cp >> 6))
                  . chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $out .= chr(0xe0 | ($cp >> 12))
                  . chr(0x80 | (($cp >> 6) & 0x003f))
                  . chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $out .= chr(0xf0 | ($cp >> 18))
                  . chr(0x80 | (($cp >> 12) & 0x3f))
                  . chr(0x80 | (($cp >> 6) & 0x3f))
                  . chr(0x80 | ($cp & 0x3f));
        }
    }

    return $out;
}
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_string extension if available. The fallback encodes code points
 * above U+FFFF as surrogate pairs.
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark (FE FF)?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding'))
        return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

    $uni = utf8_to_unicode($str);
    foreach($uni as $cp){
        if($cp > 0xFFFF && $cp <= 0x10FFFF){
            // supplementary planes need a high/low surrogate pair
            $cp  -= 0x10000;
            $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
        }else{
            $out .= pack('n',$cp);
        }
    }
    return $out;
}

/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Surrogate pairs are combined into a single code point before encoding.
 * A byte order mark and unpaired surrogates are dropped by unicode_to_utf8().
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
    $units = unpack('n*',$str);
    if(!is_array($units)) return '';

    $uni  = array();
    $high = null; // pending high surrogate waiting for its partner
    foreach($units as $unit){
        if($high !== null){
            if($unit >= 0xDC00 && $unit <= 0xDFFF){
                $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                $high  = null;
                continue;
            }
            // no low surrogate followed: hand the lone unit on unchanged
            $uni[] = $high;
            $high  = null;
        }
        if($unit >= 0xD800 && $unit <= 0xDBFF){
            $high = $unit;
        }else{
            $uni[] = $unit;
        }
    }
    if($high !== null) $uni[] = $high;

    return unicode_to_utf8($uni);
}

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of lower case letters to the code
 * points of their corresponding upper case letters. A few pairs are listed
 * more than once; the repeats are harmless because they carry the same value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
static $UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of upper case letters to the code
 * points of their corresponding lower case letters (it does so by flipping
 * $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable maps accented lower case characters to replacements taken
 * from the ASCII-7 range. Some replacements are longer than one character
 * (e.g. 'ß' => 'ss', 'ä' => 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable maps accented upper case characters to replacements taken
 * from the ASCII-7 range. These are upper case letters only.
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable maps accented upper case characters to replacements taken
 * from the ASCII-7 range. Some replacements are longer than one character
 * (e.g. 'Ä' => 'Ae', 'Þ' => 'Th').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, .
0x2e, - 0x2d, * 0x2a 844 * 845 * @author Andreas Gohr <andi@splitbrain.org> 846 * @see utf8_stripspecials() 847 */ 848$UTF8_SPECIAL_CHARS = array( 849 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 850 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 851 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 852 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 853 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 854 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 855 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 856 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 857 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 858 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 859 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 860 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 861 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 862 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 863 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 864 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 865 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 866 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 867 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 868 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 869 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 870 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 871 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 
0x2220, 0x2227, 0x2228, 0x2229, 872 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 873 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 874 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 875 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 876 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 877 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 878 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 879 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 880 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 881 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 882 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 883 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 884 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 885 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 886 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 887 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 888 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 889 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 890 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 891 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 892 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 893 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 894 0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 895 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 
0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);

/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based on Latin letters into plain ASCII. Keys are UTF-8
 * character sequences (one or more characters), values are their ASCII
 * replacements.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Several keys appear more than once below (for example 'ふ', 'し', 'ว' and
 * 'รร'). In a PHP array literal only the last value given for a key is kept,
 * so the earlier entries for such keys have no effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
$UTF8_ROMANIZATION = array(
  // Russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // 'კ' (KAN) maps to 'k'; 'p' belongs to 'პ' (PAR) only
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  // Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  // Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  // Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  // delta romanizes to D/d
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  // 'ㅛ' is 'yo'; 'oy' is the value of 'ㅚ'
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);

//Setup VIM: ex: et ts=2 enc=utf-8 :
