<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */


/**
 * Check for mb_string support
 *
 * Defines UTF8_MBSTRING as 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, and as 0 otherwise. A value defined before this file is
 * loaded is left untouched. When mbstring is selected its internal encoding
 * is switched to UTF-8.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
        mb_internal_encoding('UTF-8');
    }else{
        define('UTF8_MBSTRING',0);
    }
}


/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if characters outside of [a-zA-Z0-9/_-.%] are
 * detected - This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file  file name to encode
 * @param  boolean $safe  skip encoding when the name only holds safe chars
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    // \z instead of $: "$" would also match right before a trailing newline
    // and let a name like "foo\n" pass as already safe
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+\z#',$file)){
        return $file;
    }
    return str_replace('%2F','/',urlencode($file));
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file  encoded file name
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check is done bytewise (no /u modifier), so it also works on strings
 * that are not valid UTF-8. An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte above 0x7F is present
 */
function utf8_isASCII($str){
    return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string. Every byte above 0x7F is removed, which
 * drops complete multibyte characters.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
    return preg_replace('/[\x80-\xFF]/','',$str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 */
function utf8_check($Str) {
    // Structural check only: every lead byte must be followed by the number
    // of 10bbbbbb continuation bytes it announces. 5 and 6 byte forms are
    // accepted and neither overlong forms nor surrogates are rejected here.
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $byte = ord($Str[$i]);
        if ($byte < 0x80) continue;              # 0bbbbbbb
        elseif (($byte & 0xE0) == 0xC0) $n=1;    # 110bbbbb
        elseif (($byte & 0xF0) == 0xE0) $n=2;    # 1110bbbb
        elseif (($byte & 0xF8) == 0xF0) $n=3;    # 11110bbb
        elseif (($byte & 0xFC) == 0xF8) $n=4;    # 111110bb
        elseif (($byte & 0xFE) == 0xFC) $n=5;    # 1111110b
        else return false;                       # Does not match any model
        for ($j=0; $j<$n; $j++) {                # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; it is kept here
 * because its handling of invalid bytes defines the current result.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 * Note: supports use of negative offsets and lengths but will be slower
 * when doing so
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str
 * @param  integer $offset  number of UTF-8 characters offset (from left)
 * @param  integer $length  (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }
        return mb_substr($str, $offset, $length);
    }

    if ( $offset >= 0 && $length >= 0 ) {
        // build the quantifier for the captured part
        if ( $length === null ) {
            $length = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                $length = '*';
            } else {
                $length = '{'.$length.'}';
            }
        }

        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;
    }

    // Handle negatives using different, slower technique
    // From: http://www.php.net/manual/en/function.substr.php#44838
    // The s modifier is required: without it "." skips newlines and they
    // would vanish from the result.
    preg_match_all('/./us', $str, $ar);
    if( $length !== null ) {
        return join('',array_slice($ar[0],$offset,$length));
    }
    return join('',array_slice($ar[0],$offset));
}


/**
 * Unicode aware replacement for substr_replace()
 *
 * Unlike substr_replace() the default length is 0, i.e. the replacement is
 * inserted at $start.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string
 * @param  string  $replacement
 * @param  integer $start   character offset
 * @param  integer $length  number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep  delimiter, must not be empty
 * @param  string $str
 * @return mixed array of parts or FALSE on an empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return FALSE;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement are both taken literally. The replacement is
 * escaped before it is handed to preg_replace(), otherwise sequences like
 * "$1" or "\1" in it would be read as backreferences.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s    string or array of strings to search for
 * @param  mixed  $r    string or array of strings to insert
 * @param  string $str  subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }

    if(is_array($r)){
        foreach ($r as $k => $v) {
            $r[$k] = addcslashes($v,'\\$');
        }
    }else{
        $r = addcslashes($r,'\\$');
    }

    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @return
string 238 */ 239function utf8_ltrim($str,$charlist=''){ 240 if($charlist == '') return ltrim($str); 241 242 //quote charlist for use in a characterclass 243 $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist); 244 245 return preg_replace('/^['.$charlist.']+/u','',$str); 246} 247 248/** 249 * Unicode aware replacement for rtrim() 250 * 251 * @author Andreas Gohr <andi@splitbrain.org> 252 * @see rtrim() 253 * @return string 254 */ 255function utf8_rtrim($str,$charlist=''){ 256 if($charlist == '') return rtrim($str); 257 258 //quote charlist for use in a characterclass 259 $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist); 260 261 return preg_replace('/['.$charlist.']+$/u','',$str); 262} 263 264/** 265 * Unicode aware replacement for trim() 266 * 267 * @author Andreas Gohr <andi@splitbrain.org> 268 * @see trim() 269 * @return string 270 */ 271function utf8_trim($str,$charlist='') { 272 if($charlist == '') return trim($str); 273 274 return utf8_ltrim(utf8_rtrim($str)); 275} 276 277 278/** 279 * This is a unicode aware replacement for strtolower() 280 * 281 * Uses mb_string extension if available 282 * 283 * @author Andreas Gohr <andi@splitbrain.org> 284 * @see strtolower() 285 * @see utf8_strtoupper() 286 */ 287function utf8_strtolower($string){ 288 if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8'); 289 290 global $UTF8_UPPER_TO_LOWER; 291 $uni = utf8_to_unicode($string); 292 $cnt = count($uni); 293 for ($i=0; $i < $cnt; $i++){ 294 if($UTF8_UPPER_TO_LOWER[$uni[$i]]){ 295 $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]]; 296 } 297 } 298 return unicode_to_utf8($uni); 299} 300 301/** 302 * This is a unicode aware replacement for strtoupper() 303 * 304 * Uses mb_string extension if available 305 * 306 * @author Andreas Gohr <andi@splitbrain.org> 307 * @see strtoupper() 308 * @see utf8_strtoupper() 309 */ 310function utf8_strtoupper($string){ 311 if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8'); 312 313 global $UTF8_LOWER_TO_UPPER; 
314 $uni = utf8_to_unicode($string); 315 $cnt = count($uni); 316 for ($i=0; $i < $cnt; $i++){ 317 if($UTF8_LOWER_TO_UPPER[$uni[$i]]){ 318 $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]]; 319 } 320 } 321 return unicode_to_utf8($uni); 322} 323 324/** 325 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents 326 * 327 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1) 328 * letters. Default is to deaccent both cases ($case = 0) 329 * 330 * @author Andreas Gohr <andi@splitbrain.org> 331 */ 332function utf8_deaccent($string,$case=0){ 333 if($case <= 0){ 334 global $UTF8_LOWER_ACCENTS; 335 $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string); 336 } 337 if($case >= 0){ 338 global $UTF8_UPPER_ACCENTS; 339 $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string); 340 } 341 return $string; 342} 343 344/** 345 * Romanize a non-latin string 346 * 347 * @author Andreas Gohr <andi@splitbrain.org> 348 */ 349function utf8_romanize($string){ 350 if(utf8_isASCII($string)) return $string; //nothing to do 351 352 global $UTF8_ROMANIZATION; 353 return strtr($string,$UTF8_ROMANIZATION); 354} 355 356/** 357 * Removes special characters (nonalphanumeric) from a UTF-8 string 358 * 359 * This function adds the controlchars 0x00 to 0x19 to the array of 360 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS) 361 * 362 * @author Andreas Gohr <andi@splitbrain.org> 363 * @param string $string The UTF8 string to strip of special chars 364 * @param string $repl Replace special with this string 365 * @param string $additional Additional chars to strip (used in regexp char class) 366 */ 367function utf8_stripspecials($string,$repl='',$additional=''){ 368 global $UTF8_SPECIAL_CHARS; 369 global $UTF8_SPECIAL_CHARS2; 370 371 static $specials = null; 372 if(is_null($specials)){ 373# $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/'); 374 $specials = 
preg_quote($UTF8_SPECIAL_CHARS2, '/'); 375 } 376 377 return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string); 378} 379 380/** 381 * This is an Unicode aware replacement for strpos 382 * 383 * Uses mb_string extension if available 384 * 385 * @author Harry Fuecks <hfuecks@gmail.com> 386 * @see strpos() 387 */ 388function utf8_strpos($haystack, $needle,$offset=0) { 389 if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8'); 390 391 if(!$offset){ 392 $ar = utf8_explode($needle, $haystack); 393 if ( count($ar) > 1 ) { 394 return utf8_strlen($ar[0]); 395 } 396 return false; 397 }else{ 398 if ( !is_int($offset) ) { 399 trigger_error('Offset must be an integer',E_USER_WARNING); 400 return false; 401 } 402 403 $haystack = utf8_substr($haystack, $offset); 404 405 if ( false !== ($pos = utf8_strpos($haystack,$needle))){ 406 return $pos + $offset; 407 } 408 return false; 409 } 410} 411 412/** 413 * Encodes UTF-8 characters to HTML entities 414 * 415 * @author <vpribish at shopping dot com> 416 * @link http://www.php.net/manual/en/function.utf8-decode.php 417 */ 418function utf8_tohtml ($str) { 419 $ret = ''; 420 $max = strlen($str); 421 $last = 0; // keeps the index of the last regular character 422 for ($i=0; $i<$max; $i++) { 423 $c = $str{$i}; 424 $c1 = ord($c); 425 if ($c1>>5 == 6) { // 110x xxxx, 110 prefix for 2 bytes unicode 426 $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed 427 $c1 &= 31; // remove the 3 bit two bytes prefix 428 $c2 = ord($str{++$i}); // the next byte 429 $c2 &= 63; // remove the 2 bit trailing byte prefix 430 $c2 |= (($c1 & 3) << 6); // last 2 bits of c1 become first 2 of c2 431 $c1 >>= 2; // c1 shifts 2 to the right 432 $ret .= '&#' . ($c1 * 100 + $c2) . ';'; // this is the fastest string concatenation 433 $last = $i+1; 434 } 435 } 436 return $ret . 
substr($str, $last, $i); // append the last batch of regular characters 437} 438 439/** 440 * Takes an UTF-8 string and returns an array of ints representing the 441 * Unicode characters. Astral planes are supported ie. the ints in the 442 * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 443 * are not allowed. 444 * 445 * If $strict is set to true the function returns false if the input 446 * string isn't a valid UTF-8 octet sequence and raises a PHP error at 447 * level E_USER_WARNING 448 * 449 * Note: this function has been modified slightly in this library to 450 * trigger errors on encountering bad bytes 451 * 452 * @author <hsivonen@iki.fi> 453 * @author Harry Fuecks <hfuecks@gmail.com> 454 * @param string UTF-8 encoded string 455 * @param boolean Check for invalid sequences? 456 * @return mixed array of unicode code points or FALSE if UTF-8 invalid 457 * @see unicode_to_utf8 458 * @link http://hsivonen.iki.fi/php-utf8/ 459 * @link http://sourceforge.net/projects/phputf8/ 460 */ 461function utf8_to_unicode($str,$strict=false) { 462 $mState = 0; // cached expected number of octets after the current octet 463 // until the beginning of the next UTF8 character sequence 464 $mUcs4 = 0; // cached Unicode character 465 $mBytes = 1; // cached expected number of octets in the current sequence 466 467 $out = array(); 468 469 $len = strlen($str); 470 471 for($i = 0; $i < $len; $i++) { 472 473 $in = ord($str{$i}); 474 475 if ( $mState == 0) { 476 477 // When mState is zero we expect either a US-ASCII character or a 478 // multi-octet sequence. 479 if (0 == (0x80 & ($in))) { 480 // US-ASCII, pass straight through. 
481 $out[] = $in; 482 $mBytes = 1; 483 484 } else if (0xC0 == (0xE0 & ($in))) { 485 // First octet of 2 octet sequence 486 $mUcs4 = ($in); 487 $mUcs4 = ($mUcs4 & 0x1F) << 6; 488 $mState = 1; 489 $mBytes = 2; 490 491 } else if (0xE0 == (0xF0 & ($in))) { 492 // First octet of 3 octet sequence 493 $mUcs4 = ($in); 494 $mUcs4 = ($mUcs4 & 0x0F) << 12; 495 $mState = 2; 496 $mBytes = 3; 497 498 } else if (0xF0 == (0xF8 & ($in))) { 499 // First octet of 4 octet sequence 500 $mUcs4 = ($in); 501 $mUcs4 = ($mUcs4 & 0x07) << 18; 502 $mState = 3; 503 $mBytes = 4; 504 505 } else if (0xF8 == (0xFC & ($in))) { 506 /* First octet of 5 octet sequence. 507 * 508 * This is illegal because the encoded codepoint must be either 509 * (a) not the shortest form or 510 * (b) outside the Unicode range of 0-0x10FFFF. 511 * Rather than trying to resynchronize, we will carry on until the end 512 * of the sequence and let the later error handling code catch it. 513 */ 514 $mUcs4 = ($in); 515 $mUcs4 = ($mUcs4 & 0x03) << 24; 516 $mState = 4; 517 $mBytes = 5; 518 519 } else if (0xFC == (0xFE & ($in))) { 520 // First octet of 6 octet sequence, see comments for 5 octet sequence. 521 $mUcs4 = ($in); 522 $mUcs4 = ($mUcs4 & 1) << 30; 523 $mState = 5; 524 $mBytes = 6; 525 526 } elseif($strict) { 527 /* Current octet is neither in the US-ASCII range nor a legal first 528 * octet of a multi-octet sequence. 529 */ 530 trigger_error( 531 'utf8_to_unicode: Illegal sequence identifier '. 532 'in UTF-8 at byte '.$i, 533 E_USER_WARNING 534 ); 535 return FALSE; 536 537 } 538 539 } else { 540 541 // When mState is non-zero, we expect a continuation of the multi-octet 542 // sequence 543 if (0x80 == (0xC0 & ($in))) { 544 545 // Legal continuation. 546 $shift = ($mState - 1) * 6; 547 $tmp = $in; 548 $tmp = ($tmp & 0x0000003F) << $shift; 549 $mUcs4 |= $tmp; 550 551 /** 552 * End of the multi-octet sequence. 
mUcs4 now contains the final 553 * Unicode codepoint to be output 554 */ 555 if (0 == --$mState) { 556 557 /* 558 * Check for illegal sequences and codepoints. 559 */ 560 // From Unicode 3.1, non-shortest form is illegal 561 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 562 ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 563 ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 564 (4 < $mBytes) || 565 // From Unicode 3.2, surrogate characters are illegal 566 (($mUcs4 & 0xFFFFF800) == 0xD800) || 567 // Codepoints outside the Unicode range are illegal 568 ($mUcs4 > 0x10FFFF)) { 569 570 if($strict){ 571 trigger_error( 572 'utf8_to_unicode: Illegal sequence or codepoint '. 573 'in UTF-8 at byte '.$i, 574 E_USER_WARNING 575 ); 576 577 return FALSE; 578 } 579 580 } 581 582 if (0xFEFF != $mUcs4) { 583 // BOM is legal but we don't want to output it 584 $out[] = $mUcs4; 585 } 586 587 //initialize UTF8 cache 588 $mState = 0; 589 $mUcs4 = 0; 590 $mBytes = 1; 591 } 592 593 } elseif($strict) { 594 /** 595 *((0xC0 & (*in) != 0x80) && (mState != 0)) 596 * Incomplete multi-octet sequence. 597 */ 598 trigger_error( 599 'utf8_to_unicode: Incomplete multi-octet '. 600 ' sequence in UTF-8 at byte '.$i, 601 E_USER_WARNING 602 ); 603 604 return FALSE; 605 } 606 } 607 } 608 return $out; 609} 610 611/** 612 * Takes an array of ints representing the Unicode characters and returns 613 * a UTF-8 string. Astral planes are supported ie. the ints in the 614 * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 615 * are not allowed. 
616 * 617 * If $strict is set to true the function returns false if the input 618 * array contains ints that represent surrogates or are outside the 619 * Unicode range and raises a PHP error at level E_USER_WARNING 620 * 621 * Note: this function has been modified slightly in this library to use 622 * output buffering to concatenate the UTF-8 string (faster) as well as 623 * reference the array by it's keys 624 * 625 * @param array of unicode code points representing a string 626 * @param boolean Check for invalid sequences? 627 * @return mixed UTF-8 string or FALSE if array contains invalid code points 628 * @author <hsivonen@iki.fi> 629 * @author Harry Fuecks <hfuecks@gmail.com> 630 * @see utf8_to_unicode 631 * @link http://hsivonen.iki.fi/php-utf8/ 632 * @link http://sourceforge.net/projects/phputf8/ 633 */ 634function unicode_to_utf8($arr,$strict=false) { 635 if (!is_array($arr)) return ''; 636 ob_start(); 637 638 foreach (array_keys($arr) as $k) { 639 640 # ASCII range (including control chars) 641 if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) { 642 643 echo chr($arr[$k]); 644 645 # 2 byte sequence 646 } else if ($arr[$k] <= 0x07ff) { 647 648 echo chr(0xc0 | ($arr[$k] >> 6)); 649 echo chr(0x80 | ($arr[$k] & 0x003f)); 650 651 # Byte order mark (skip) 652 } else if($arr[$k] == 0xFEFF) { 653 654 // nop -- zap the BOM 655 656 # Test for illegal surrogates 657 } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) { 658 659 // found a surrogate 660 if($strict){ 661 trigger_error( 662 'unicode_to_utf8: Illegal surrogate '. 
663 'at index: '.$k.', value: '.$arr[$k], 664 E_USER_WARNING 665 ); 666 return FALSE; 667 } 668 669 # 3 byte sequence 670 } else if ($arr[$k] <= 0xffff) { 671 672 echo chr(0xe0 | ($arr[$k] >> 12)); 673 echo chr(0x80 | (($arr[$k] >> 6) & 0x003f)); 674 echo chr(0x80 | ($arr[$k] & 0x003f)); 675 676 # 4 byte sequence 677 } else if ($arr[$k] <= 0x10ffff) { 678 679 echo chr(0xf0 | ($arr[$k] >> 18)); 680 echo chr(0x80 | (($arr[$k] >> 12) & 0x3f)); 681 echo chr(0x80 | (($arr[$k] >> 6) & 0x3f)); 682 echo chr(0x80 | ($arr[$k] & 0x3f)); 683 684 } elseif($strict) { 685 686 trigger_error( 687 'unicode_to_utf8: Codepoint out of Unicode range '. 688 'at index: '.$k.', value: '.$arr[$k], 689 E_USER_WARNING 690 ); 691 692 // out of range 693 return FALSE; 694 } 695 } 696 697 $result = ob_get_contents(); 698 ob_end_clean(); 699 return $result; 700} 701 702/** 703 * UTF-8 to UTF-16BE conversion. 704 * 705 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 706 */ 707function utf8_to_utf16be(&$str, $bom = false) { 708 $out = $bom ? "\xFE\xFF" : ''; 709 if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8'); 710 711 $uni = utf8_to_unicode($str); 712 foreach($uni as $cp){ 713 $out .= pack('n',$cp); 714 } 715 return $out; 716} 717 718/** 719 * UTF-8 to UTF-16BE conversion. 
720 * 721 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 722 */ 723function utf16be_to_utf8(&$str) { 724 $uni = unpack('n*',$str); 725 return unicode_to_utf8($uni); 726} 727 728/** 729 * Replace bad bytes with an alternative character 730 * 731 * ASCII character is recommended for replacement char 732 * 733 * PCRE Pattern to locate bad bytes in a UTF-8 string 734 * Comes from W3 FAQ: Multilingual Forms 735 * Note: modified to include full ASCII range including control chars 736 * 737 * @author Harry Fuecks <hfuecks@gmail.com> 738 * @see http://www.w3.org/International/questions/qa-forms-utf-8 739 * @param string to search 740 * @param string to replace bad bytes with (defaults to '?') - use ASCII 741 * @return string 742 */ 743function utf8_bad_replace($str, $replace = '') { 744 $UTF8_BAD = 745 '([\x00-\x7F]'. # ASCII (including control chars) 746 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte 747 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs 748 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte 749 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates 750 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3 751 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15 752 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. 
# plane 16 753 '|(.{1}))'; # invalid byte 754 ob_start(); 755 while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) { 756 if ( !isset($matches[2])) { 757 echo $matches[0]; 758 } else { 759 echo $replace; 760 } 761 $str = substr($str,strlen($matches[0])); 762 } 763 $result = ob_get_contents(); 764 ob_end_clean(); 765 return $result; 766} 767 768/** 769 * adjust a byte index into a utf8 string to a utf8 character boundary 770 * 771 * @param $str string utf8 character string 772 * @param $i int byte index into $str 773 * @param $next bool direction to search for boundary, 774 * false = up (current character) 775 * true = down (next character) 776 * 777 * @return int byte index into $str now pointing to a utf8 character boundary 778 * 779 * @author chris smith <chris@jalakai.co.uk> 780 */ 781function utf8_correctIdx(&$str,$i,$next=false) { 782 783 if ($i <= 0) return 0; 784 785 $limit = strlen($str); 786 if ($i>=$limit) return $limit; 787 788 if ($next) { 789 while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++; 790 } else { 791 while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--; 792 } 793 794 return $i; 795} 796 797// only needed if no mb_string available 798if(!UTF8_MBSTRING){ 799 800 /** 801 * UTF-8 Case lookup table 802 * 803 * This lookuptable defines the upper case letters to their correspponding 804 * lower case letter in UTF-8 805 * 806 * @author Andreas Gohr <andi@splitbrain.org> 807 */ 808 global $UTF8_LOWER_TO_UPPER; 809 $UTF8_LOWER_TO_UPPER = array( 810 0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042, 811 0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100, 812 0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393, 813 0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C, 814 0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F, 815 0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E, 816 
0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3, 817 0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A, 818 0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9, 819 0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C, 820 0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4, 821 0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164, 822 0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156, 823 0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118, 824 0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128, 825 0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428, 826 0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055, 827 0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A, 828 0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC, 829 0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0, 830 0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D, 831 0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0, 832 0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5, 833 0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA, 834 0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045, 835 0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F, 836 0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048, 837 0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6, 838 0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407, 839 0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 
0x03B5=>0x0395, 840 0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396, 841 0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051, 842 0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408, 843 0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F, 844 0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126, 845 0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C, 846 0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E, 847 0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB, 848 0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421, 849 0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A, 850 0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102, 851 0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9, 852 0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122, 853 ); 854 855 /** 856 * UTF-8 Case lookup table 857 * 858 * This lookuptable defines the lower case letters to their correspponding 859 * upper case letter in UTF-8 (it does so by flipping $UTF8_LOWER_TO_UPPER) 860 * 861 * @author Andreas Gohr <andi@splitbrain.org> 862 */ 863 global $UTF8_UPPER_TO_LOWER; 864 $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER); 865 866} // end of case lookup tables 867 868 869/** 870 * UTF-8 lookup table for lower case accented letters 871 * 872 * This lookuptable defines replacements for accented characters from the ASCII-7 873 * range. This are lower case letters only. 
 *
 * Keys are the accented UTF-8 characters, values their ASCII replacement
 * (which may be longer than one character, e.g. 'ß' => 'ss').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are upper case letters only.
 *
 * Keys are the accented UTF-8 characters, values their ASCII replacement
 * (which may be longer than one character, e.g. 'Ä' => 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, .
0x2e, - 0x2d, * 0x2a
 *
 * Each entry is a Unicode code point (integer), listed in ascending order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
  0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);

// utf8 version of above data
//
// The string is derived from $UTF8_SPECIAL_CHARS when the file is loaded.
// It used to be a hand-maintained literal whose pieces cut multi-byte
// characters in half at the string boundaries and embedded raw control
// bytes, which does not survive editing or re-encoding of this file.
// NOTE(review): assumes the old literal was exactly the UTF-8 encoding of
// the code points above, in the same order - confirm against a pristine copy.
global $UTF8_SPECIAL_CHARS2;
$UTF8_SPECIAL_CHARS2 = '';
foreach ($UTF8_SPECIAL_CHARS as $utf8_cp) {
  if ($utf8_cp < 0x80) {
    // 1 byte:  0bbbbbbb
    $UTF8_SPECIAL_CHARS2 .= chr($utf8_cp);
  } elseif ($utf8_cp < 0x800) {
    // 2 bytes: 110bbbbb 10bbbbbb
    $UTF8_SPECIAL_CHARS2 .= chr(0xC0 | ($utf8_cp >> 6)).
                            chr(0x80 | ($utf8_cp & 0x3F));
  } elseif ($utf8_cp < 0x10000) {
    // 3 bytes: 1110bbbb 10bbbbbb 10bbbbbb
    $UTF8_SPECIAL_CHARS2 .= chr(0xE0 | ($utf8_cp >> 12)).
                            chr(0x80 | (($utf8_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_cp & 0x3F));
  } else {
    // 4 bytes: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
    // (no such entry today; kept so the table can grow safely)
    $UTF8_SPECIAL_CHARS2 .= chr(0xF0 | ($utf8_cp >> 18)).
                            chr(0x80 | (($utf8_cp >> 12) & 0x3F)).
                            chr(0x80 | (($utf8_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_cp & 0x3F));
  }
}
// do not leave the loop variable behind in the including scope
unset($utf8_cp);

/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one-way from nonlatin to ASCII and it works by simple character replacement
 * only. Specialities of each language are not supported.
1022 * 1023 * @author Andreas Gohr <andi@splitbrain.org> 1024 * @author Vitaly Blokhin <vitinfo@vitn.com> 1025 * @link http://www.uconv.com/translit.htm 1026 * @author Bisqwit <bisqwit@iki.fi> 1027 * @link http://kanjidict.stc.cx/hiragana.php?src=2 1028 * @link http://www.translatum.gr/converter/greek-transliteration.htm 1029 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 1030 * @link http://www.btranslations.com/resources/romanization/korean.asp 1031 */ 1032global $UTF8_ROMANIZATION; 1033$UTF8_ROMANIZATION = array( 1034 //russian cyrillic 1035 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 1036 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 1037 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 1038 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 1039 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 1040 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1041 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1042 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 1043 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 1044 // Ukrainian cyrillic 1045 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 1046 // Georgian 1047 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 1048 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 1049 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 1050 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 1051 'ჰ'=>'xh', 1052 //Sanskrit 1053 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 1054 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 1055 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 1056 
'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 1057 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 1058 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 1059 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 1060 //Hebrew 1061 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 1062 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 1063 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 1064 'ש'=>'sh','ת'=>'t', 1065 //Arabic 1066 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 1067 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 1068 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 1069 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 1070 1071 // Japanese hiragana 1072 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 1073 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 1074 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1075 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 1076 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 1077 'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 1078 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1079 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 1080 'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 1081 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 1082 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 1083 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 1084 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 1085 'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 1086 
'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1087 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 1088 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 1089 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 1090 'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 1091 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 1092 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 1093 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 1094 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 1095 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 1096 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 1097 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1098 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1099 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 1100 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 1101 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 1102 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 1103 'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 1104 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 1105 'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 1106 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 1107 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 1108 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 1109 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 1110 'じゅ'=>'zyu', 1111 // Japanese katakana 1112 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 1113 
'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 1114 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 1115 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 1116 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 1117 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 1118 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 1119 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1120 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 1121 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 1122 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 1123 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 1124 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 1125 'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1126 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 1127 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 1128 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 1129 'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 1130 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 1131 'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1132 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 1133 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 1134 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 1135 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 1136 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 1137 
'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1138 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 1139 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 1140 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 1141 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 1142 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1143 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 1144 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 1145 'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 1146 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 1147 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 1148 'ジョ'=>'zyo','ジュ'=>'zyu', 1149 1150 // "Greeklish" 1151 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 1152 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 1153 1154 // Thai 1155 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 1156 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 1157 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 1158 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 1159 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 1160 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1161 'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 1162 '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 1163 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 1164 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 1165 
'–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 1166 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 1167 'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 1168 'เ–ียว'=>'iao', 1169 1170 // Korean 1171 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 1172 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 1173 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 1174 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 1175 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 1176 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 1177); 1178 1179//Setup VIM: ex: et ts=2 enc=utf-8 : 1180 1181