<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * Check for mb_string support
 *
 * UTF8_MBSTRING is set to 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise to 0. A value defined before this file is
 * loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }


/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename to encode
 * @param  boolean $safe skip encoding when the name only consists of
 *                       letters, digits and the characters / _ - . %
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    $file = urlencode($file);
    // urlencode() also encodes the path separator, undo that
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
    $file = urldecode($file);
    return $file;
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte of the string is above 127
 */
function utf8_isASCII($str){
    // the pattern runs without the 'u' modifier, so it inspects raw bytes
    return !preg_match('/[\x80-\xFF]/',(string)$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string the input with every byte above 127 removed
 */
function utf8_strip($str){
    $str   = (string)$str;
    $len   = strlen($str);
    $ascii = '';
    for($i=0; $i<$len; $i++){
        // square brackets: the curly brace offset syntax is gone in PHP 8
        if(ord($str[$i]) < 128){
            $ascii .= $str[$i];
        }
    }
    return $ascii;
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * @author <bmorel@ssi.fr>
 * @link
 */
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the byte structure is checked: every lead byte has to be followed by
 * the number of continuation bytes (10bbbbbb) it announces. Lead bytes for
 * 5 and 6 byte sequences are accepted and the decoded values are not
 * inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean false on the first byte that does not fit the scheme
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue; # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
        else return false; # Does not match any model
        for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * NOTE(review): utf8_decode() is reported as deprecated by recent PHP
 * releases - confirm before relying on this on new installations.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or false if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
        $strlen = utf8_strlen($str);        // see notes
        $offset = $strlen + $offset;
        if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
        $Ox = (int)($offset/65535);
        $Oy = $offset%65535;

        if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
        $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
        $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
        $length_pattern = '(.*)$';                  // the rest of the string
    } else {

        if (!isset($strlen)) $strlen = utf8_strlen($str);    // see notes
        if ($offset > $strlen) return '';           // another trivial case

        if ($length > 0) {

            $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

            $Lx = (int)($length/65535);
            $Ly = $length%65535;

            // +ve length requires ... a captured group of length characters
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

        } else if ($length < 0) {

            if ($length < ($offset - $strlen)) return '';

            $Lx = (int)((-$length)/65535);
            $Ly = (-$length)%65535;

            // -ve length requires ... capture everything except a group of -length characters
            //                         anchored at the tail-end of the string
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
        }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * Unlike substr_replace() the default $length is 0, so by default the
 * replacement is inserted at $start and nothing is removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      the input string
 * @param  string $replacement the text to put in
 * @param  int    $start       character offset, negative values count from the end
 * @param  int    $length      number of characters to replace, a negative value
 *                             keeps that many characters at the end untouched
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    // negative values are relative to the end of the string; turn them into
    // absolute ones first, otherwise the part in front of $start gets lost
    if($start < 0 || $length < 0){
        $strlen = utf8_strlen($string);
        if($start < 0) $start = max(0, $strlen + $start);
        if($start > $strlen) $start = $strlen;
        if($length < 0) $length = max(0, $strlen - $start + $length);
    }

    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter, must not be empty
 * @param  string $str the string to split
 * @return mixed  array of parts or false (plus an E_USER_WARNING) on an empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return false;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement strings are taken literally.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to replace with
 * @param  string $str the string to work on
 * @return mixed  result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            // the delimiter has to be quoted here as well
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }

    // protect \ and $ so preg_replace() does not read them as backreferences
    $escape = array('\\' => '\\\\', '$' => '\\$');
    if(is_array($r)){
        foreach ($r as $k => $v) {
            $r[$k] = strtr($v,$escape);
        }
    }else{
        $r = strtr($r,$escape);
    }

    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 */
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. Otherwise the characters
 * of the charlist are stripped from the beginning of the string; the
 * charlist is a plain list of characters, ranges are not interpreted.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass (including ^, which would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. Otherwise the characters
 * of the charlist are stripped from the end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass (including ^, which would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // hand the charlist on, otherwise only whitespace would be trimmed
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the global
 * $UTF8_UPPER_TO_LOWER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset() avoids an undefined index for every unmapped code point
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the global
 * $UTF8_LOWER_TO_UPPER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // isset() avoids an undefined index for every unmapped code point
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * The replacements are taken from the global $UTF8_ROMANIZATION and
 * applied with strtr().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the list of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * The quoted content of the global $UTF8_SPECIAL_CHARS2 is cached in a
 * static variable on first use.
 * NOTE(review): $UTF8_SPECIAL_CHARS2 is presumably a string holding the
 * special characters - it is defined elsewhere, confirm.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS2;

    static $specials = null;
    if(is_null($specials)){
        $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
    }

    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset   character offset to start searching at
 * @return mixed  character position of the first match or false
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        $ar = utf8_explode($needle, $haystack);
        // utf8_explode() hands back false for an empty needle
        if ( is_array($ar) && count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }else{
        if ( !is_int($offset) ) {
            trigger_error('Offset must be an integer',E_USER_WARNING);
            return false;
        }

        $haystack = utf8_substr($haystack, $offset);

        if ( false !== ($pos = utf8_strpos($haystack,$needle))){
            return $pos + $offset;
        }
        return false;
    }
}

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII is passed through, code points below 0x100 become decimal and
 * everything else hexadecimal numeric entities.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
    $ret = '';
    foreach (utf8_to_unicode($str) as $cp) {
        if ($cp < 0x80)
            $ret .= chr($cp);
        elseif ($cp < 0x100)
            $ret .= "&#$cp;";
        else
            $ret .= '&#x'.dechex($cp).';';
    }
    return $ret;
}

/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including & < etc.
 * are handled as well.
 */
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including & < etc.
 * are handled as well. Avoids the problem that would occur if you
 * had to decode "&#38;&amp;#38;"
 *
 *     unhtmlspecialchars(utf8_unhtml($s)) -> "&&"
 *     utf8_unhtml(unhtmlspecialchars($s)) -> "&&#38;"
 *     what it should be                   -> "&&#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    if (is_null($entities))
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);

    // the named entity table is only built when it is really needed
    static $decoder = null;
    if (is_null($decoder))
        $decoder = new utf8_entity_decoder();

    return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                 array(&$decoder, 'decode'), $str);
}

/**
 * Callback for utf8_unhtml(): turn one numeric entity into UTF-8
 *
 * @param  array $ent match array; [2] holds 'x' or 'X' for hexadecimal
 *                    entities, [3] the digits
 * @return string
 */
function utf8_decode_numeric($ent) {
    switch ($ent[2]) {
      case 'X':
      case 'x':
          $cp = hexdec($ent[3]);
          break;
      default:
          $cp = intval($ent[3]);
          break;
    }
    return unicode_to_utf8(array($cp));
}

/**
 * Decoder for named and numeric entities, used by utf8_unhtml()
 *
 * Keeps a table mapping each entity known to get_html_translation_table()
 * to its UTF-8 encoded character.
 */
class utf8_entity_decoder {
    var $table;

    /**
     * Build the entity => UTF-8 character table
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES);
        $table = array_flip($table);
        $this->table = array_map(array(&$this,'makeutf8'), $table);
    }

    /**
     * Old style constructor, kept for PHP versions without __construct()
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert one character of the translation table to UTF-8
     *
     * ASCII bytes are the same in UTF-8 and values longer than one byte
     * are taken as already UTF-8 encoded (newer PHP versions return the
     * table in UTF-8 by default); only a single high byte is read as an
     * ISO-8859-1 character.
     */
    function makeutf8($c) {
        if (strlen($c) > 1 || ord($c) < 0x80) return $c;
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * Callback for utf8_unhtml(): decode one matched entity
     *
     * Entities missing from the table are returned unchanged.
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        } elseif (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        } else {
            return $ent[0];
        }
    }
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 */
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Without $strict bad lead bytes and misplaced bytes are skipped silently,
 * while completed sequences are added to the output even when they fail
 * the legality checks.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // square brackets: the curly brace offset syntax is gone in PHP 8
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
        }
    }
    return $out;
}

/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to use
 * output buffering to concatenate the UTF-8 string (faster) as well as
 * reference the array by it's keys
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 */
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * Without $strict such values are skipped.
 *
 * The string is built by plain concatenation, so no output buffer can be
 * left open when the function bails out early.
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # negative values and values above the Unicode range can't be encoded
        if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return false;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $utf8;
}

/**
 * UTF-8 to UTF-16BE conversion.
 */
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when mb_string is available. The fallback
 * builds the string from utf8_to_unicode() and writes code points above
 * 0xFFFF as surrogate pairs.
 *
 * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

    $uni = utf8_to_unicode($str);
    foreach($uni as $cp){
        if($cp > 0xFFFF){
            // does not fit into one 16 bit unit, split into a surrogate pair
            $cp  -= 0x10000;
            $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
        }else{
            $out .= pack('n',$cp);
        }
    }
    return $out;
}

/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Surrogate pairs are combined into a single code point before the
 * conversion; a surrogate without its partner is handed on unchanged to
 * unicode_to_utf8().
 *
 * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
    $units = unpack('n*',$str);
    if(!is_array($units)) return '';

    $uni  = array();
    $high = 0;      // pending high surrogate, 0 if there is none
    foreach($units as $unit){
        if($high && $unit >= 0xDC00 && $unit <= 0xDFFF){
            $uni[] = 0x10000 + (($high & 0x3FF) << 10) + ($unit & 0x3FF);
            $high  = 0;
            continue;
        }
        if($high){
            // the pending high surrogate has no partner
            $uni[] = $high;
            $high  = 0;
        }
        if($unit >= 0xD800 && $unit <= 0xDBFF){
            $high = $unit;
        }else{
            $uni[] = $unit;
        }
    }
    if($high) $uni[] = $high;

    return unicode_to_utf8($uni);
}

/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string to search
 * @param string to replace bad bytes with (defaults to '', which removes them) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte
    $result = '';
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        // the second group only exists when the "invalid byte" branch matched
        if ( !isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $str = substr($str,strlen($matches[0]));
    }
    return $result;
}

/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = up (current character)
 *                           true = down (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

    if ($i <= 0) return 0;

    $limit = strlen($str);
    if ($i>=$limit) return $limit;

    // bytes of the form 10bbbbbb are continuation bytes, step over them
    if ($next) {
        while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
    } else {
        while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
    }

    return $i;
}

// only needed if no mb_string available
// (an undefined constant is treated like "no mb_string" so this part can be loaded on its own)
if(!defined('UTF8_MBSTRING') || !UTF8_MBSTRING){

    /**
     * UTF-8 Case lookup table
     *
     * This lookuptable defines the lower case letters to their corresponding
     * upper case letter in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_LOWER_TO_UPPER;
    $UTF8_LOWER_TO_UPPER = array(
        0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
        0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
        0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
        0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
        0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
        0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
        0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
        0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
        0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
        0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
        0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
        0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
        0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
        0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
        0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
        0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
        0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
        0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
        0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
        0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
        0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
        0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
        0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
        0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
        0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
        0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
        0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
        0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
        0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
        0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
        0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
        0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
        0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
        0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
        0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
        0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
        0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
        0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
        0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
        0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
        0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
        0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
        0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
    );

    /**
     * UTF-8 Case lookup table
     *
     * This lookuptable defines the upper case letters to their corresponding
     * lower case letter in UTF-8 (it does so by flipping $UTF8_LOWER_TO_UPPER)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_UPPER_TO_LOWER;
    $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables


/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. This are lower case letters only.
 */
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps accented lower case characters to an ASCII-7 replacement. Some
 * replacements are longer than one character (e.g. 'ß' => 'ss').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
    'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f',  'ë'=>'e',  'š'=>'s',  'ơ'=>'o',
    'ß'=>'ss', 'ă'=>'a',  'ř'=>'r',  'ț'=>'t',  'ň'=>'n',  'ā'=>'a',  'ķ'=>'k',
    'ŝ'=>'s',  'ỳ'=>'y',  'ņ'=>'n',  'ĺ'=>'l',  'ħ'=>'h',  'ṗ'=>'p',  'ó'=>'o',
    'ú'=>'u',  'ě'=>'e',  'é'=>'e',  'ç'=>'c',  'ẁ'=>'w',  'ċ'=>'c',  'õ'=>'o',
    'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t',  'ș'=>'s',  'ė'=>'e',  'ĉ'=>'c',
    'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c',  'ę'=>'e',  'ŵ'=>'w',  'ṫ'=>'t',
    'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e',  'ŷ'=>'y',  'ą'=>'a',  'ł'=>'l',
    'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g',  'ļ'=>'l',  'ƒ'=>'f',  'ž'=>'z',
    'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i',  'ï'=>'i',  'ḋ'=>'d',  'ť'=>'t',
    'ŗ'=>'r',  'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r',  'ê'=>'e',  'ü'=>'ue', 'ò'=>'o',
    'ē'=>'e',  'ñ'=>'n',  'ń'=>'n',  'ĥ'=>'h',  'ĝ'=>'g',  'đ'=>'d',  'ĵ'=>'j',
    'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',  'ư'=>'u',  'ţ'=>'t',  'ý'=>'y',  'ő'=>'o',
    'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z',  'ī'=>'i',  'ã'=>'a',  'ġ'=>'g',
    'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u',  'į'=>'i',  'ź'=>'z',  'á'=>'a',
    'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',  'ĕ'=>'e',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. This are upper case letters only.
 */
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps accented upper case characters to an ASCII-7 replacement. Some
 * replacements are longer than one character (e.g. 'Ä' => 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
    'À'=>'A',  'Ô'=>'O',  'Ď'=>'D',  'Ḟ'=>'F',  'Ë'=>'E',  'Š'=>'S',  'Ơ'=>'O',
    'Ă'=>'A',  'Ř'=>'R',  'Ț'=>'T',  'Ň'=>'N',  'Ā'=>'A',  'Ķ'=>'K',
    'Ŝ'=>'S',  'Ỳ'=>'Y',  'Ņ'=>'N',  'Ĺ'=>'L',  'Ħ'=>'H',  'Ṗ'=>'P',  'Ó'=>'O',
    'Ú'=>'U',  'Ě'=>'E',  'É'=>'E',  'Ç'=>'C',  'Ẁ'=>'W',  'Ċ'=>'C',  'Õ'=>'O',
    'Ṡ'=>'S',  'Ø'=>'O',  'Ģ'=>'G',  'Ŧ'=>'T',  'Ș'=>'S',  'Ė'=>'E',  'Ĉ'=>'C',
    'Ś'=>'S',  'Î'=>'I',  'Ű'=>'U',  'Ć'=>'C',  'Ę'=>'E',  'Ŵ'=>'W',  'Ṫ'=>'T',
    'Ū'=>'U',  'Č'=>'C',  'Ö'=>'Oe', 'È'=>'E',  'Ŷ'=>'Y',  'Ą'=>'A',  'Ł'=>'L',
    'Ų'=>'U',  'Ů'=>'U',  'Ş'=>'S',  'Ğ'=>'G',  'Ļ'=>'L',  'Ƒ'=>'F',  'Ž'=>'Z',
    'Ẃ'=>'W',  'Ḃ'=>'B',  'Å'=>'A',  'Ì'=>'I',  'Ï'=>'I',  'Ḋ'=>'D',  'Ť'=>'T',
    'Ŗ'=>'R',  'Ä'=>'Ae', 'Í'=>'I',  'Ŕ'=>'R',  'Ê'=>'E',  'Ü'=>'Ue', 'Ò'=>'O',
    'Ē'=>'E',  'Ñ'=>'N',  'Ń'=>'N',  'Ĥ'=>'H',  'Ĝ'=>'G',  'Đ'=>'D',  'Ĵ'=>'J',
    'Ÿ'=>'Y',  'Ũ'=>'U',  'Ŭ'=>'U',  'Ư'=>'U',  'Ţ'=>'T',  'Ý'=>'Y',  'Ő'=>'O',
    'Â'=>'A',  'Ľ'=>'L',  'Ẅ'=>'W',  'Ż'=>'Z',  'Ī'=>'I',  'Ã'=>'A',  'Ġ'=>'G',
    'Ṁ'=>'M',  'Ō'=>'O',  'Ĩ'=>'I',  'Ù'=>'U',  'Į'=>'I',  'Ź'=>'Z',  'Á'=>'A',
    'Û'=>'U',  'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, .
 */
0x2e, - 0x2d, * 0x2a 1034 * 1035 * @author Andreas Gohr <andi@splitbrain.org> 1036 * @see utf8_stripspecials() 1037 */ 1038global $UTF8_SPECIAL_CHARS; 1039$UTF8_SPECIAL_CHARS = array( 1040 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 1041 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 1042 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 1043 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 1044 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 1045 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 1046 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 1047 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 1048 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 1049 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 1050 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 1051 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 1052 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 1053 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 1054 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 1055 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 1056 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 1057 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 1058 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 1059 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 1060 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 1061 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 
0x2212, 1062 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 1063 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 1064 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 1065 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 1066 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 1067 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 1068 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 1069 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 1070 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 1071 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 1072 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 1073 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 1074 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 1075 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 1076 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 1077 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 1078 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 1079 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 1080 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 1081 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 1082 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 1083 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 1084 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 1085 0x27be, 0x3000, 0x3001, 0x3002, 
0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 1086 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, 1087 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 1088 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 1089 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 1090 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 1091 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 1092 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 1093 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 1094 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 1095 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 1096 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 1097 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 1098 0xffeb, 0xffec, 0xffed, 0xffee, 1099); 1100 1101// utf8 version of above data 1102global $UTF8_SPECIAL_CHARS2; 1103$UTF8_SPECIAL_CHARS2 = 1104 "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~ �'. 1105 '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'. 1106 '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'. 1107 '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿–—―‗‘’‚“”�'. 1108 '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'. 1109 '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'. 1110 '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'. 1111 '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'. 1112 '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'. 1113 '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'. 1114 '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'. 1115 '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. 1116 '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. 1117 '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. 1118 '➷➸➹➺➻➼➽➾'. 1119 ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. 1120 '�'. 1121 '�ﹼﹽ'. 1122 '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. 
1123 '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'; 1124 1125/** 1126 * Romanization lookup table 1127 * 1128 * This lookup tables provides a way to transform strings written in a language 1129 * different from the ones based upon latin letters into plain ASCII. 1130 * 1131 * Please note: this is not a scientific transliteration table. It only works 1132 * oneway from nonlatin to ASCII and it works by simple character replacement 1133 * only. Specialities of each language are not supported. 1134 * 1135 * @author Andreas Gohr <andi@splitbrain.org> 1136 * @author Vitaly Blokhin <vitinfo@vitn.com> 1137 * @link http://www.uconv.com/translit.htm 1138 * @author Bisqwit <bisqwit@iki.fi> 1139 * @link http://kanjidict.stc.cx/hiragana.php?src=2 1140 * @link http://www.translatum.gr/converter/greek-transliteration.htm 1141 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 1142 * @link http://www.btranslations.com/resources/romanization/korean.asp 1143 */ 1144global $UTF8_ROMANIZATION; 1145$UTF8_ROMANIZATION = array( 1146 //russian cyrillic 1147 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 1148 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 1149 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 1150 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 1151 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 1152 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1153 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1154 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 1155 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 1156 // Ukrainian cyrillic 1157 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 1158 // Georgian 1159 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 1160 
'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 1161 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 1162 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 1163 'ჰ'=>'xh', 1164 //Sanskrit 1165 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 1166 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 1167 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 1168 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 1169 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 1170 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 1171 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 1172 //Hebrew 1173 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 1174 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 1175 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 1176 'ש'=>'sh','ת'=>'t', 1177 //Arabic 1178 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 1179 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 1180 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 1181 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 1182 1183 // Japanese hiragana 1184 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 1185 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 1186 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1187 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 1188 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 1189 'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 1190 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1191 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 1192 
'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 1193 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 1194 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 1195 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 1196 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 1197 'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 1198 'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1199 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 1200 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 1201 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 1202 'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 1203 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 1204 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 1205 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 1206 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 1207 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 1208 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 1209 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1210 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1211 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 1212 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 1213 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 1214 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 1215 'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 1216 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 1217 'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 
1218 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 1219 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 1220 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 1221 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 1222 'じゅ'=>'zyu', 1223 // Japanese katakana 1224 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 1225 'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 1226 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 1227 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 1228 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 1229 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 1230 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 1231 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1232 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 1233 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 1234 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 1235 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 1236 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 1237 'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1238 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 1239 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 1240 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 1241 'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 1242 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 1243 'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1244 
'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 1245 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 1246 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 1247 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 1248 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 1249 'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1250 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 1251 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 1252 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 1253 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 1254 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1255 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 1256 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 1257 'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 1258 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 1259 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 1260 'ジョ'=>'zyo','ジュ'=>'zyu', 1261 1262 // "Greeklish" 1263 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 1264 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 1265 1266 // Thai 1267 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 1268 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 1269 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 1270 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 1271 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 1272 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1273 
'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 1274 '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 1275 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 1276 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 1277 '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 1278 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 1279 'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 1280 'เ–ียว'=>'iao', 1281 1282 // Korean 1283 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 1284 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 1285 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 1286 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 1287 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 1288 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 1289); 1290 1291//Setup VIM: ex: et ts=2 enc=utf-8 : 1292 1293