<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * Check for mb_string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, otherwise 0. A value defined before this file is loaded is
 * left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }


/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding when $file only holds URL-safe characters
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    // already URL-safe (this includes '%', so encoded names pass unchanged)
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file); // keep path separators readable
    return $file;
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool true when no byte above 0x7F is present (also for '')
 */
function utf8_isASCII($str){
    // byte-wise match (no /u modifier): any high byte means non-ASCII
    return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string $str without any byte above 0x7F
 */
function utf8_strip($str){
    // byte-wise match (no /u modifier)
    return preg_replace('/[\x80-\xFF]/','',$str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * @author <bmorel@ssi.fr>
 * @link   
http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool false when a lead byte is not followed by the number of
 *              continuation bytes it announces, true otherwise
 */
function utf8_check($Str) {
    // Only the lead/continuation byte structure is checked (including the
    // obsolete 5 and 6 byte forms); overlong forms are not rejected.
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue; # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
        else return false; # Does not match any model
        for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * Counts every byte that is not a UTF-8 continuation byte (10bbbbbb). For
 * well formed UTF-8 this is the number of characters. utf8_decode() is not
 * used for counting because it is deprecated as of PHP 8.2.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    // byte-wise match (no /u modifier): drop continuation bytes, count the rest
    return strlen(preg_replace('/[\x80-\xBF]/','',$string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 * Note: supports use of negative offsets and lengths but will be slower
 * when doing so
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    // Fast path: non-negative offset/length below the PCRE repeat limit are
    // handled by a single regular expression. A null $length passes both
    // comparisons and means "up to the end of the string".
    if ( $offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534) {
        if ( $length === null ) {
            $length = '*';
        } else {
            $strlen = utf8_strlen($str);
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                $length = '*';
            } else {
                $length = '{'.$length.'}';
            }
        }

        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;

    } else {

        // convert character offsets to byte offsets and use normal substr()
        // 1. normalise parameters into positive offset and length and carry out simple checks
        $strlen = utf8_strlen($str);

        if ($offset < 0) {
            $offset = max($strlen+$offset,0);
        }
        if ($offset >= $strlen) return false;

        if ($length === null) {
            // 2a. convert to start byte offset
            list($start) = _utf8_byteindex($str,$offset);
            return substr($str,$start);
        }

        if ($length < 0) {
            $length = $strlen-$offset+$length;
            if ($length < 0) return '';
        }

        if ($length === 0) return '';
        if ($strlen - $offset < $length) $length = $strlen-$offset;

        // 2b. convert to start and end byte offsets
        list($start,$end) = _utf8_byteindex($str,$offset,$offset+$length);
        return substr($str,$start,$end-$start);
    }
}


/**
 * Unicode aware replacement for substr_replace()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  character offset where the replacement starts
 * @param  int    $length number of characters to replace (default 0: insert)
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter, must not be empty
 * @param  string $str
 * @return mixed array of parts, FALSE (plus an E_USER_WARNING) on empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return FALSE;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement strings are taken literally: the search strings are
 * quoted for the '!' pattern delimiter, and '\' and '$' in the replacement
 * are escaped so they are not read as regular expression back references.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of replacement strings
 * @param  string $str subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }

    if(!is_array($r)){
        $r = addcslashes($r,'\\$');
    }else{
        foreach ($r as $k => $v) {
            $r[$k] = addcslashes($v,'\\$');
        }
    }

    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this simply calls ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass ('^' must not negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this simply calls rtrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass ('^' must not negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist this simply calls trim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // the charlist has to be handed on, otherwise only whitespace is trimmed
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // code points without a table entry are kept as they are
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // code points without a table entry are kept as they are
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. 
Default is to deaccent both cases ($case = 0)
 *
 * The replacements are read from the global tables $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case -1: lower case only, 1: upper case only, 0: both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * The replacements are read from the global table $UTF8_ROMANIZATION.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
 * and quoted once per request.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS2;

    static $specials = null;
    if(is_null($specials)){
        $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
    }

    // $additional goes into the character class unquoted, as documented above
    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset character offset to start searching from
 * @return mixed character position of the first match or FALSE
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        // the length of everything before the first separator is the position
        $ar = utf8_explode($needle, $haystack);
        if ( count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }else{
        if ( !is_int($offset) ) {
            trigger_error('Offset must be an integer',E_USER_WARNING);
            return false;
        }

        $haystack = utf8_substr($haystack, $offset);

        if ( false !== ($pos = utf8_strpos($haystack,$needle))){
            return $pos + $offset;
        }
        return false;
    }
}

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (U+0080 to U+07FF) are converted to decimal
 * numeric entities. All other bytes, including longer sequences and lead
 * bytes without a following continuation byte, are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
    $ret  = '';
    $max  = strlen($str);
    $last = 0; // index of the first byte not yet copied to $ret
    for ($i=0; $i<$max; $i++) {
        $lead = ord($str[$i]);
        if (($lead & 0xE0) != 0xC0) continue; // not 110x xxxx
        if ($i+1 >= $max) break;              // truncated sequence at the end

        $trail = ord($str[$i+1]);
        if (($trail & 0xC0) != 0x80) continue; // not followed by 10xx xxxx

        $ret .= substr($str, $last, $i-$last); // the regular bytes passed so far
        // five payload bits of the lead byte, six of the trail byte
        $ret .= '&#' . ((($lead & 0x1F) << 6) | ($trail & 0x3F)) . ';';
        $i++;                                  // the trail byte is consumed
        $last = $i+1;
    }
    if ($last >= $max) return $ret;
    return $ret . substr($str, $last); // the last batch of regular bytes
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed. 
465 * 466 * If $strict is set to true the function returns false if the input 467 * string isn't a valid UTF-8 octet sequence and raises a PHP error at 468 * level E_USER_WARNING 469 * 470 * Note: this function has been modified slightly in this library to 471 * trigger errors on encountering bad bytes 472 * 473 * @author <hsivonen@iki.fi> 474 * @author Harry Fuecks <hfuecks@gmail.com> 475 * @param string UTF-8 encoded string 476 * @param boolean Check for invalid sequences? 477 * @return mixed array of unicode code points or FALSE if UTF-8 invalid 478 * @see unicode_to_utf8 479 * @link http://hsivonen.iki.fi/php-utf8/ 480 * @link http://sourceforge.net/projects/phputf8/ 481 */ 482function utf8_to_unicode($str,$strict=false) { 483 $mState = 0; // cached expected number of octets after the current octet 484 // until the beginning of the next UTF8 character sequence 485 $mUcs4 = 0; // cached Unicode character 486 $mBytes = 1; // cached expected number of octets in the current sequence 487 488 $out = array(); 489 490 $len = strlen($str); 491 492 for($i = 0; $i < $len; $i++) { 493 494 $in = ord($str{$i}); 495 496 if ( $mState == 0) { 497 498 // When mState is zero we expect either a US-ASCII character or a 499 // multi-octet sequence. 500 if (0 == (0x80 & ($in))) { 501 // US-ASCII, pass straight through. 502 $out[] = $in; 503 $mBytes = 1; 504 505 } else if (0xC0 == (0xE0 & ($in))) { 506 // First octet of 2 octet sequence 507 $mUcs4 = ($in); 508 $mUcs4 = ($mUcs4 & 0x1F) << 6; 509 $mState = 1; 510 $mBytes = 2; 511 512 } else if (0xE0 == (0xF0 & ($in))) { 513 // First octet of 3 octet sequence 514 $mUcs4 = ($in); 515 $mUcs4 = ($mUcs4 & 0x0F) << 12; 516 $mState = 2; 517 $mBytes = 3; 518 519 } else if (0xF0 == (0xF8 & ($in))) { 520 // First octet of 4 octet sequence 521 $mUcs4 = ($in); 522 $mUcs4 = ($mUcs4 & 0x07) << 18; 523 $mState = 3; 524 $mBytes = 4; 525 526 } else if (0xF8 == (0xFC & ($in))) { 527 /* First octet of 5 octet sequence. 
528 * 529 * This is illegal because the encoded codepoint must be either 530 * (a) not the shortest form or 531 * (b) outside the Unicode range of 0-0x10FFFF. 532 * Rather than trying to resynchronize, we will carry on until the end 533 * of the sequence and let the later error handling code catch it. 534 */ 535 $mUcs4 = ($in); 536 $mUcs4 = ($mUcs4 & 0x03) << 24; 537 $mState = 4; 538 $mBytes = 5; 539 540 } else if (0xFC == (0xFE & ($in))) { 541 // First octet of 6 octet sequence, see comments for 5 octet sequence. 542 $mUcs4 = ($in); 543 $mUcs4 = ($mUcs4 & 1) << 30; 544 $mState = 5; 545 $mBytes = 6; 546 547 } elseif($strict) { 548 /* Current octet is neither in the US-ASCII range nor a legal first 549 * octet of a multi-octet sequence. 550 */ 551 trigger_error( 552 'utf8_to_unicode: Illegal sequence identifier '. 553 'in UTF-8 at byte '.$i, 554 E_USER_WARNING 555 ); 556 return FALSE; 557 558 } 559 560 } else { 561 562 // When mState is non-zero, we expect a continuation of the multi-octet 563 // sequence 564 if (0x80 == (0xC0 & ($in))) { 565 566 // Legal continuation. 567 $shift = ($mState - 1) * 6; 568 $tmp = $in; 569 $tmp = ($tmp & 0x0000003F) << $shift; 570 $mUcs4 |= $tmp; 571 572 /** 573 * End of the multi-octet sequence. mUcs4 now contains the final 574 * Unicode codepoint to be output 575 */ 576 if (0 == --$mState) { 577 578 /* 579 * Check for illegal sequences and codepoints. 580 */ 581 // From Unicode 3.1, non-shortest form is illegal 582 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 583 ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 584 ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 585 (4 < $mBytes) || 586 // From Unicode 3.2, surrogate characters are illegal 587 (($mUcs4 & 0xFFFFF800) == 0xD800) || 588 // Codepoints outside the Unicode range are illegal 589 ($mUcs4 > 0x10FFFF)) { 590 591 if($strict){ 592 trigger_error( 593 'utf8_to_unicode: Illegal sequence or codepoint '. 
594 'in UTF-8 at byte '.$i, 595 E_USER_WARNING 596 ); 597 598 return FALSE; 599 } 600 601 } 602 603 if (0xFEFF != $mUcs4) { 604 // BOM is legal but we don't want to output it 605 $out[] = $mUcs4; 606 } 607 608 //initialize UTF8 cache 609 $mState = 0; 610 $mUcs4 = 0; 611 $mBytes = 1; 612 } 613 614 } elseif($strict) { 615 /** 616 *((0xC0 & (*in) != 0x80) && (mState != 0)) 617 * Incomplete multi-octet sequence. 618 */ 619 trigger_error( 620 'utf8_to_unicode: Incomplete multi-octet '. 621 ' sequence in UTF-8 at byte '.$i, 622 E_USER_WARNING 623 ); 624 625 return FALSE; 626 } 627 } 628 } 629 return $out; 630} 631 632/** 633 * Takes an array of ints representing the Unicode characters and returns 634 * a UTF-8 string. Astral planes are supported ie. the ints in the 635 * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates 636 * are not allowed. 637 * 638 * If $strict is set to true the function returns false if the input 639 * array contains ints that represent surrogates or are outside the 640 * Unicode range and raises a PHP error at level E_USER_WARNING 641 * 642 * Note: this function has been modified slightly in this library to use 643 * output buffering to concatenate the UTF-8 string (faster) as well as 644 * reference the array by it's keys 645 * 646 * @param array of unicode code points representing a string 647 * @param boolean Check for invalid sequences? 
648 * @return mixed UTF-8 string or FALSE if array contains invalid code points 649 * @author <hsivonen@iki.fi> 650 * @author Harry Fuecks <hfuecks@gmail.com> 651 * @see utf8_to_unicode 652 * @link http://hsivonen.iki.fi/php-utf8/ 653 * @link http://sourceforge.net/projects/phputf8/ 654 */ 655function unicode_to_utf8($arr,$strict=false) { 656 if (!is_array($arr)) return ''; 657 ob_start(); 658 659 foreach (array_keys($arr) as $k) { 660 661 # ASCII range (including control chars) 662 if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) { 663 664 echo chr($arr[$k]); 665 666 # 2 byte sequence 667 } else if ($arr[$k] <= 0x07ff) { 668 669 echo chr(0xc0 | ($arr[$k] >> 6)); 670 echo chr(0x80 | ($arr[$k] & 0x003f)); 671 672 # Byte order mark (skip) 673 } else if($arr[$k] == 0xFEFF) { 674 675 // nop -- zap the BOM 676 677 # Test for illegal surrogates 678 } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) { 679 680 // found a surrogate 681 if($strict){ 682 trigger_error( 683 'unicode_to_utf8: Illegal surrogate '. 684 'at index: '.$k.', value: '.$arr[$k], 685 E_USER_WARNING 686 ); 687 return FALSE; 688 } 689 690 # 3 byte sequence 691 } else if ($arr[$k] <= 0xffff) { 692 693 echo chr(0xe0 | ($arr[$k] >> 12)); 694 echo chr(0x80 | (($arr[$k] >> 6) & 0x003f)); 695 echo chr(0x80 | ($arr[$k] & 0x003f)); 696 697 # 4 byte sequence 698 } else if ($arr[$k] <= 0x10ffff) { 699 700 echo chr(0xf0 | ($arr[$k] >> 18)); 701 echo chr(0x80 | (($arr[$k] >> 12) & 0x3f)); 702 echo chr(0x80 | (($arr[$k] >> 6) & 0x3f)); 703 echo chr(0x80 | ($arr[$k] & 0x3f)); 704 705 } elseif($strict) { 706 707 trigger_error( 708 'unicode_to_utf8: Codepoint out of Unicode range '. 709 'at index: '.$k.', value: '.$arr[$k], 710 E_USER_WARNING 711 ); 712 713 // out of range 714 return FALSE; 715 } 716 } 717 718 $result = ob_get_contents(); 719 ob_end_clean(); 720 return $result; 721} 722 723/** 724 * UTF-8 to UTF-16BE conversion. 
725 * 726 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 727 */ 728function utf8_to_utf16be(&$str, $bom = false) { 729 $out = $bom ? "\xFE\xFF" : ''; 730 if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8'); 731 732 $uni = utf8_to_unicode($str); 733 foreach($uni as $cp){ 734 $out .= pack('n',$cp); 735 } 736 return $out; 737} 738 739/** 740 * UTF-8 to UTF-16BE conversion. 741 * 742 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 743 */ 744function utf16be_to_utf8(&$str) { 745 $uni = unpack('n*',$str); 746 return unicode_to_utf8($uni); 747} 748 749/** 750 * Replace bad bytes with an alternative character 751 * 752 * ASCII character is recommended for replacement char 753 * 754 * PCRE Pattern to locate bad bytes in a UTF-8 string 755 * Comes from W3 FAQ: Multilingual Forms 756 * Note: modified to include full ASCII range including control chars 757 * 758 * @author Harry Fuecks <hfuecks@gmail.com> 759 * @see http://www.w3.org/International/questions/qa-forms-utf-8 760 * @param string to search 761 * @param string to replace bad bytes with (defaults to '?') - use ASCII 762 * @return string 763 */ 764function utf8_bad_replace($str, $replace = '') { 765 $UTF8_BAD = 766 '([\x00-\x7F]'. # ASCII (including control chars) 767 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte 768 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs 769 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte 770 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates 771 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3 772 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15 773 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. 
# plane 16 774 '|(.{1}))'; # invalid byte 775 ob_start(); 776 while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) { 777 if ( !isset($matches[2])) { 778 echo $matches[0]; 779 } else { 780 echo $replace; 781 } 782 $str = substr($str,strlen($matches[0])); 783 } 784 $result = ob_get_contents(); 785 ob_end_clean(); 786 return $result; 787} 788 789/** 790 * adjust a byte index into a utf8 string to a utf8 character boundary 791 * 792 * @param $str string utf8 character string 793 * @param $i int byte index into $str 794 * @param $next bool direction to search for boundary, 795 * false = up (current character) 796 * true = down (next character) 797 * 798 * @return int byte index into $str now pointing to a utf8 character boundary 799 * 800 * @author chris smith <chris@jalakai.co.uk> 801 */ 802function utf8_correctIdx(&$str,$i,$next=false) { 803 804 if ($i <= 0) return 0; 805 806 $limit = strlen($str); 807 if ($i>=$limit) return $limit; 808 809 if ($next) { 810 while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++; 811 } else { 812 while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--; 813 } 814 815 return $i; 816} 817 818/** 819 * determine the byte indexes into a utf-8 string for one or more character offsets 820 * PRIVATE (could be made public with proper paramter checking) 821 * 822 * @author Chris Smith <chris@jalakai.co.uk> 823 * 824 * @param string $str utf8 string 825 * @param int $offset any number of character offsets into $str 826 * 827 * @return array byte indexes into $str, one index for each offset argument 828 */ 829function _utf8_byteindex() { 830 831 $args = func_get_args(); 832 $str =& array_shift($args); 833 if (!is_string($str)) return false; 834 835 $result = array(); 836 837 // use a short piece of str to estimate bytes per character 838 $i = utf8_correctIdx($str, 300, true); // $i (& $j) -> byte indexes into $str 839 $c = utf8_strlen(substr($str,0,$i)); // $c -> character offset into $str 840 841 sort($args); // deal with arguments from 
lowest to highest 842 foreach ($args as $offset) { 843 // sanity checks FIXME 844 845 // 0 is an easy check 846 if ($offset == 0) { $result[] = 0; continue; } 847 848 $safety_valve = 50; // ensure no endless looping 849 850 do { 851 $j = (int)($offset * $i/$c); // apply latest bytes/character estimate to offset 852 $j = utf8_correctIdx($str, $j, true); // correct to utf8 character boundary 853 854 if ($j > $i) { 855 $c += utf8_strlen(substr($str,$i,$j-$i)); // determine new character offset 856 } else { 857 $c -= utf8_strlen(substr($str,$j,$i-$j)); // ditto 858 } 859 860 $error = abs($c-$offset); 861 862 $i = $j; // ready for next time around 863 } while (($error > 7) && --$safety_valve) ; // from 7 it is faster to iterate over the string 864 865 if ($error && $error <= 7) { 866 if ($c < $offset) { 867 // move up 868 while ($error--) { $i = utf8_correctIdx($str,++$i,true); } 869 } else { 870 // move down 871 while ($error--) { $i = utf8_correctIdx($str,--$i,false); } 872 } 873 $c = $offset; // ready for next arg 874 } 875 $result[] = $i; 876 } 877 878 return $result; 879} 880 881// only needed if no mb_string available 882if(!UTF8_MBSTRING){ 883 884 /** 885 * UTF-8 Case lookup table 886 * 887 * This lookuptable defines the upper case letters to their correspponding 888 * lower case letter in UTF-8 889 * 890 * @author Andreas Gohr <andi@splitbrain.org> 891 */ 892 global $UTF8_LOWER_TO_UPPER; 893 $UTF8_LOWER_TO_UPPER = array( 894 0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042, 895 0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100, 896 0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393, 897 0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C, 898 0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F, 899 0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E, 900 0x0135=>0x0134, 0x0447=>0x0427, 
0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3, 901 0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A, 902 0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9, 903 0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C, 904 0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4, 905 0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164, 906 0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156, 907 0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118, 908 0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128, 909 0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428, 910 0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055, 911 0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A, 912 0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC, 913 0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0, 914 0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D, 915 0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0, 916 0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5, 917 0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA, 918 0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045, 919 0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F, 920 0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048, 921 0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6, 922 0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407, 923 0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395, 924 0x0445=>0x0425, 
0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396, 925 0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051, 926 0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408, 927 0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F, 928 0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126, 929 0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C, 930 0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E, 931 0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB, 932 0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421, 933 0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A, 934 0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102, 935 0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9, 936 0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122, 937 ); 938 939 /** 940 * UTF-8 Case lookup table 941 * 942 * This lookuptable defines the lower case letters to their correspponding 943 * upper case letter in UTF-8 (it does so by flipping $UTF8_LOWER_TO_UPPER) 944 * 945 * @author Andreas Gohr <andi@splitbrain.org> 946 */ 947 global $UTF8_UPPER_TO_LOWER; 948 $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER); 949 950} // end of case lookup tables 951 952 953/** 954 * UTF-8 lookup table for lower case accented letters 955 * 956 * This lookuptable defines replacements for accented characters from the ASCII-7 957 * range. This are lower case letters only. 
958 * 959 * @author Andreas Gohr <andi@splitbrain.org> 960 * @see utf8_deaccent() 961 */ 962global $UTF8_LOWER_ACCENTS; 963$UTF8_LOWER_ACCENTS = array( 964 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o', 965 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k', 966 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o', 967 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o', 968 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c', 969 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't', 970 'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l', 971 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z', 972 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't', 973 'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', 974 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j', 975 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o', 976 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g', 977 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a', 978 'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e', 979); 980 981/** 982 * UTF-8 lookup table for upper case accented letters 983 * 984 * This lookuptable defines replacements for accented characters from the ASCII-7 985 * range. This are upper case letters only. 
986 * 987 * @author Andreas Gohr <andi@splitbrain.org> 988 * @see utf8_deaccent() 989 */ 990global $UTF8_UPPER_ACCENTS; 991$UTF8_UPPER_ACCENTS = array( 992 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O', 993 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 994 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O', 995 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O', 996 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C', 997 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T', 998 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L', 999 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z', 1000 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T', 1001 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', 1002 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J', 1003 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O', 1004 'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G', 1005 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A', 1006 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E', 1007); 1008 1009/** 1010 * UTF-8 array of common special characters 1011 * 1012 * This array should contain all special characters (not a letter or digit) 1013 * defined in the various local charsets - it's not a complete list of non-alphanum 1014 * characters in UTF-8. It's not perfect but should match most cases of special 1015 * chars. 1016 * 1017 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! 1018 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 
0x2e, - 0x2d, * 0x2a
 *
 * Each element is a Unicode code point given as an integer. All listed code
 * points lie in the Basic Multilingual Plane (the largest is 0xfe7d).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);

/**
 * UTF-8 version of the data in $UTF8_SPECIAL_CHARS
 *
 * One string holding the UTF-8 encoding of every code point listed above, in
 * the same order. It is generated from the array instead of being kept as a
 * literal: the former literal was cut in the middle of multi-byte characters,
 * which left the source file itself invalid UTF-8 and made it easy for
 * editors and converters to damage those characters. Generating the string
 * also guarantees that both tables always describe the same set.
 *
 * NOTE(review): presumably consumed by utf8_stripspecials() - confirm against
 * the caller before relying on the byte order.
 */
global $UTF8_SPECIAL_CHARS2;
$UTF8_SPECIAL_CHARS2 = '';
foreach($UTF8_SPECIAL_CHARS as $utf8_special_cp){
  if($utf8_special_cp < 0x80){
    // 0bbbbbbb - plain ASCII, one byte
    $UTF8_SPECIAL_CHARS2 .= chr($utf8_special_cp);
  }elseif($utf8_special_cp < 0x800){
    // 110bbbbb 10bbbbbb - two bytes
    $UTF8_SPECIAL_CHARS2 .= chr(0xC0 | ($utf8_special_cp >> 6)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }else{
    // 1110bbbb 10bbbbbb 10bbbbbb - three bytes, sufficient because no
    // listed code point exceeds 0xffff
    $UTF8_SPECIAL_CHARS2 .= chr(0xE0 | ($utf8_special_cp >> 12)).
                            chr(0x80 | (($utf8_special_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }
}
// do not leave the loop variable behind in the including scope
unset($utf8_special_cp);

/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
*
 * Keys are UTF-8 characters or short character sequences, values are their
 * ASCII replacement (possibly an empty string, as for the cyrillic hard sign).
 *
 * Several keys in the Japanese and Thai sections are listed more than once
 * with different values. PHP keeps only the value of the last occurrence of
 * a key in an array literal, so e.g. 'ふ' effectively maps to 'hu' (not 'fu')
 * and 'い' to 'yi' (not 'i'). The duplicates are kept untouched on purpose,
 * because changing the effective value would change the output for existing
 * input.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // ('კ' is the letter kan and maps to 'k'; it used to share 'p' with 'პ')
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  // (contains repeated keys - the last listed value of a key is the one used)
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  // (contains repeated keys - the last listed value of a key is the one used)
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  // (delta maps to 'D'/'d'; it used to be listed as 'E'/'e')
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  // (contains repeated keys - the last listed value of a key is the one used)
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  // ('ㅛ' is the vowel yo and maps to 'yo'; it used to share 'oy' with 'ㅚ')
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);

//Setup VIM: ex: et ts=2 enc=utf-8 :
