<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * check for mb_string support
 *
 * UTF8_MBSTRING is set to 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise to 0. A value defined before this file is
 * loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }

if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str string to inspect
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }
}

if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str input, any encoding
     * @return string the bytes of $str that are below 0x80, in original order
     */
    function utf8_strip($str){
        $str   = (string)$str;
        $ascii = '';
        $len   = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax ($str{$i}) was
            // deprecated in PHP 7.4 and removed in PHP 8.0
            if(ord($str[$i]) < 128){
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}

if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the lead byte / continuation byte structure is checked. Sequences
     * of up to six bytes are accepted and no test for overlong forms or
     * surrogates is made.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str string to test
     * @return bool   true if every byte fits the UTF-8 byte pattern
     */
    function utf8_check($Str) {
        $len = strlen($Str);
        for ($i=0; $i<$len; $i++) {
            $b = ord($Str[$i]);
            if ($b < 0x80) continue; # 0bbbbbbb
            elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
            elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
            elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
            elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
            elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
            else return false; # Does not match any model

            for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
                if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                    return false;
            }
        }
        return true;
    }
}

if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright - It's
     * even faster than mb_strlen.
     *
     * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; it still works
     * but a replacement will be needed eventually.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        return strlen(utf8_decode($string));
    }
}

if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = strlen(utf8_decode($str));        // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}

if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Unlike substr_replace() the default $length is 0, i.e. $replacement is
     * inserted at $start without removing anything.
     *
     * Negative arguments follow the substr_replace() conventions: a negative
     * $start counts from the end of the string, a negative $length is the
     * number of characters to keep at the end of the string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      UTF-8 encoded string
     * @param  string $replacement text to put in place of the removed part
     * @param  int    $start       character offset where the replacement begins
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // resolve end-relative arguments; the (comparatively expensive)
        // character count is only taken when one of them is negative
        if($start < 0 || $length < 0){
            $strlen = utf8_strlen($string);
            if($start < 0)  $start  = max(0, $strlen + $start);
            if($start > $strlen) $start = $strlen;
            if($length < 0) $length = max(0, $strlen - $start + $length);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}

if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist characters to strip; when empty, ltrim() is used
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        //quote charlist for use in a characterclass
        $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}

if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist characters to strip; when empty, rtrim() is used
     * @return string
     */
    function utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        //quote charlist for use in a characterclass
        $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}

if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     *
@author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      string to trim
     * @param  string $charlist characters to strip from both ends; when empty, trim() is used
     * @return string
     */
    function utf8_trim($str,$charlist='') {
        if($charlist != ''){
            // strip the right end first, then the left end
            $rightTrimmed = utf8_rtrim($str,$charlist);
            return utf8_ltrim($rightTrimmed,$charlist);
        }

        return trim($str);
    }
}

if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_UPPER_TO_LOWER lookup table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}

if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER lookup table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}

if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and the untouched remainder
        preg_match('/^(.{1})(.*)$/us', $str, $parts);
        return utf8_strtoupper($parts[1]).$parts[2];
    }
}

if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param
string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // A "word" is a run of characters that are not one of
        // [\x0c\x09\x0b\x0a\x0d\x20] (form feed, horizontal tab, vertical tab,
        // linefeed, carriage return, space), matching the definition used at
        // http://www.php.net/ucwords
        // Group 2 holds the whitespace in front of the word, group 3 its first character.
        return preg_replace_callback(
            '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u',
            'utf8_ucwords_callback',
            $str
        );
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see    utf8_ucwords
     * @see    utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        // the word itself, without the whitespace that preceded it
        $word = ltrim($matches[0]);

        // swap the first character for its upper case form and
        // put the leading whitespace back in front
        return $matches[2] . utf8_substr_replace($word, utf8_strtoupper($matches[3]), 0, 1);
    }
}

if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters.
Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}

if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned unchanged; anything else is run through
     * the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string,$UTF8_ROMANIZATION);
        }

        return $string;
    }
}

if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted character list is built once and reused on later calls
        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional is inserted as-is: the caller supplies valid char class content
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}

if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack UTF-8 encoded string to search in
     * @param  string  $needle   string to search for
     * @param  integer $offset   character offset to start searching from
     * @return integer|false character position of the match, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // difference between byte position and character position seen so far
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            // translate the byte position into a character position
            $length = utf8_strlen(substr($haystack, 0, $pos));

            // match lies before the requested character offset:
            // push the byte offset forward and search again
            if ($length < $offset) {
                $comp = $pos - $length;
            }
        } while ($length < $offset);

        return $length;
    }
}

if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII is passed through, code points below 0x100 become decimal
     * entities, everything else hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $ret = '';
        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $ret .= '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $ret .= "&#$cp;";
            } else {
                $ret .= chr($cp);
            }
        }
        return $ret;
    }
}

if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&#38;&amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&&"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&#38;"
     * what it should be                   -> "&&#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
*/
    function utf8_unhtml($str, $entities=null) {
        static $decoder = null;

        // numeric entities only: no translation table needed
        if (is_null($entities)) {
            return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                         'utf8_decode_numeric', $str);
        }

        // named entities requested: build the (cached) decoder on first use
        if (is_null($decoder)) {
            $decoder = new utf8_entity_decoder();
        }
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
    }
}

if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects a match array as produced by the patterns in utf8_unhtml():
     * index 0 is the whole entity, index 2 the optional x/X hex marker and
     * index 3 the digits. An entity whose digits are not valid for its base
     * (e.g. "&#zz;") is returned unchanged instead of being turned into a
     * NUL byte.
     *
     * @param  array $ent match array for a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $isHex  = ($ent[2] === 'X' || $ent[2] === 'x');
        $digits = $ent[3];

        if ($isHex) {
            if (!preg_match('/^[0-9A-Fa-f]+$/', $digits)) return $ent[0];
            $cp = hexdec($digits);
        } else {
            if (!preg_match('/^[0-9]+$/', $digits)) return $ent[0];
            $cp = intval($digits);
        }

        return unicode_to_utf8(array($cp));
    }
}

if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array entity (e.g. "&eacute;") => UTF-8 encoded character */
        var $table;

        /**
         * Initializes the decoding tables
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES);
            $table = array_flip($table);
            $this->table = array_map(array(&$this,'makeutf8'), $table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single ISO-8859-1 byte to UTF-8. Values that are not
         * exactly one byte long are returned untouched: when the translation
         * table is delivered in UTF-8 (the usual case on PHP >= 5.4) its
         * non-ASCII characters are already complete multi-byte sequences and
         * running ord() on them would only pick up their first byte.
         *
         * @param  string $c character taken from the translation table
         * @return mixed
         */
        function makeutf8($c) {
            if (strlen($c) != 1) return $c;
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to it's correct UTF-8 char equivalent
         *
         * Numeric entities are handed to utf8_decode_numeric(), named ones
         * are looked up in the table; unknown entities are returned as is.
         *
         * @param  array $ent match array for an entity
         * @return string
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}

if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters.
Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. This includes a multi-octet sequence that is cut
     * off by the end of the string.
     *
     * Without $strict, illegal code points are passed through to the output
     * and stray bytes are silently dropped.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the $str{$i} offset syntax was removed in PHP 8.0
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }

        // input ended in the middle of a multi-octet sequence
        if($strict && $mState != 0){
            trigger_error(
                    'utf8_to_unicode: Incomplete multi-octet '.
                    '   sequence in UTF-8 at byte '.$len,
                    E_USER_WARNING
                );

            return false;
        }

        return $out;
    }
}

if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
     * are not allowed.
*
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are skipped.
     *
     * The string is assembled by plain concatenation, so no output buffer
     * is opened and none can be left behind when strict mode bails out early.
     *
     * @param  array   $arr    of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # outside the Unicode range (negative values used to end up
                # in the 2 byte branch and produced garbage bytes)

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff)

                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $result;
    }
}

if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mb_string is available. The fallback
     * encodes code points above 0xFFFF as surrogate pairs; as it is built on
     * utf8_to_unicode() it drops any BOM found in the input.
     *
     * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
     * @param  boolean $bom prepend a UTF-16BE byte order mark?
     * @return string  UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp >= 0x10000 && $cp <= 0x10FFFF){
                // astral plane: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}

if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
*
     * 16 bit units are read big endian; a high surrogate directly followed
     * by a low surrogate is combined into one astral code point. Unpaired
     * surrogates are passed on to unicode_to_utf8(), which skips them.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if (!is_array($units)) return '';

        $uni  = array();
        $high = null;   // pending high surrogate waiting for its partner
        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = null;
                    continue;
                }
                // no low surrogate followed: hand the lone unit on unchanged
                $uni[] = $high;
                $high  = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
            } else {
                $uni[] = $unit;
            }
        }
        if ($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}

if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    http://www.w3.org/International/questions/qa-forms-utf-8
     * @param  string $str     to search
     * @param  string $replace to replace bad bytes with (defaults to '', i.e. bad
     *                         bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
            '([\x00-\x7F]'.                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
            '|(.{1}))';                              # invalid byte

        $str    = (string)$str;
        $len    = strlen($str);
        $pos    = 0;
        $result = '';

        // Walk the string with an offset instead of cutting the consumed part
        // off after every match (which copied the remainder each time).
        // The A modifier anchors each match at $pos.
        while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
            // group 2 only takes part in the match for an invalid byte
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $pos    += strlen($matches[0]);
        }

        return $result;
    }
}

if(!function_exists('utf8_correctIdx')){
    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = up (current character)
     *                           true = down (next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i>=$limit) return $limit;

        // bytes of the form 10xxxxxx are continuation bytes: step over them
        if ($next) {
            while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
        } else {
            while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
        }

        return $i;
    }
}

// only needed if no mb_string available
if(!UTF8_MBSTRING){
    /**
     * UTF-8 Case lookup table
     *
     * This lookuptable maps the lower case letters to their corresponding
     * upper case letter in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_LOWER_TO_UPPER;
    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
"ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ", 969 "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ", 970 "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ", 971 "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ", 972 "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ", 973 "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ", 974 "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ", 975 "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ", 976 "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ", 977 "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố", 978 "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ", 979 "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ", 980 "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ", 981 "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ", 982 "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ", 983 "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ", 984 "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ", 985 "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ", 986 "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ", 987 "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ", 988 "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս", 989 
"ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ", 990 "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ", 991 "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ", 992 "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ", 993 "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ", 994 "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ", 995 "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ", 996 "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ", 997 "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ", 998 "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ", 999 "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ", 1000 "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ", 1001 "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ", 1002 "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П", 1003 "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е", 1004 "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ", 1005 "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ", 1006 "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ", 1007 "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π", 1008 "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ", 1009 
"ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ", 1010 "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ", 1011 "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ", 1012 "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș", 1013 "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ", 1014 "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ", 1015 "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ", 1016 "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ", 1017 "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ", 1018 "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž", 1019 "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ", 1020 "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ", 1021 "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ", 1022 "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī", 1023 "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė", 1024 "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă", 1025 "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö", 1026 "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì", 1027 "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â", 1028 "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T", 1029 
"s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J", 1030 "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A" 1031 ); 1032 1033 /** 1034 * UTF-8 Case lookup table 1035 * 1036 * This lookuptable defines the lower case letters to their corresponding 1037 * upper case letter in UTF-8 1038 * 1039 * @author Andreas Gohr <andi@splitbrain.org> 1040 */ 1041 global $UTF8_UPPER_TO_LOWER; 1042 if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array ( 1043 "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q", 1044 "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g", 1045 "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ", 1046 "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ", 1047 "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ", 1048 "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ", 1049 "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ", 1050 "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ", 1051 "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ", 1052 "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ", 1053 "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ", 1054 "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ", 1055 "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ", 1056 "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố", 1057 "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ", 1058 
"Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ", 1059 "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ", 1060 "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ", 1061 "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ", 1062 "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ", 1063 "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ", 1064 "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ", 1065 "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ", 1066 "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ", 1067 "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս", 1068 "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ", 1069 "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ", 1070 "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ", 1071 "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ", 1072 "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ", 1073 "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ", 1074 "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ", 1075 "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ", 1076 "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ", 1077 "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ", 1078 
"Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ", 1079 "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ", 1080 "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ", 1081 "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п", 1082 "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е", 1083 "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ", 1084 "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ", 1085 "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ", 1086 "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π", 1087 "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ", 1088 "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ", 1089 "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ", 1090 "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ", 1091 "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș", 1092 "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ", 1093 "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ", 1094 "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ", 1095 "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ", 1096 "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ", 1097 "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž", 1098 
"Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ", 1099 "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ", 1100 "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ", 1101 "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī", 1102 "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė", 1103 "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă", 1104 "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö", 1105 "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì", 1106 "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â", 1107 "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t", 1108 "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j", 1109 "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a" 1110 ); 1111}; // end of case lookup tables 1112 1113/** 1114 * UTF-8 lookup table for lower case accented letters 1115 * 1116 * This lookuptable defines replacements for accented characters from the ASCII-7 1117 * range. This are lower case letters only. 
 *
 * Most entries map to a single ASCII letter; a few map to more than one
 * character (e.g. 'ß' => 'ss', 'ä' => 'ae', 'ö' => 'oe', 'ü' => 'ue',
 * 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
// empty() is true for an unset or empty variable, so a table that has already
// been populated is left untouched and only a missing one is filled in here.
if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table defines replacements for accented characters, using
 * characters from the ASCII-7 range. These are upper case letters only.
 *
 * Most entries map to a single ASCII letter; entries that map to more than one
 * character capitalize only the first letter (e.g. 'Ä' => 'Ae', 'Ö' => 'Oe',
 * 'Ü' => 'Ue', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
// empty() is true for an unset or empty variable, so a table that has already
// been populated is left untouched and only a missing one is filled in here.
if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  // NOTE(review): this row has six entries, not seven - the table has no
  // upper case entry corresponding to 'ß'.
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (i.e. characters that are
 * neither a letter nor a digit) defined in the various local charsets - it is
 * not a complete list of non-alphanumeric characters in UTF-8. It's not perfect
 * but should match most cases of special chars.
 *
 * The control characters 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1178 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a 1179 * 1180 * @author Andreas Gohr <andi@splitbrain.org> 1181 * @see utf8_stripspecials() 1182 */ 1183global $UTF8_SPECIAL_CHARS; 1184if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array( 1185 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 1186 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 1187 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 1188 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 1189 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 1190 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 1191 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 1192 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 1193 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 1194 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 1195 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 1196 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 1197 0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 1198 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 1199 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 1200 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 1201 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 1202 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 1203 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 1204 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 1205 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 
0x2200, 0x2202, 1206 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 1207 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 1208 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 1209 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 1210 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 1211 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 1212 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 1213 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 1214 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 1215 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 1216 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 1217 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 1218 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 1219 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 1220 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 1221 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 1222 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 1223 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 1224 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 1225 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 1226 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 1227 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 1228 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 1229 0x27b4, 0x27b5, 0x27b6, 
0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 1230 0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 1231 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, 1232 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 1233 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 1234 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 1235 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 1236 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 1237 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 1238 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 1239 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 1240 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 1241 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 1242 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 1243 0xffeb, 0xffec, 0xffed, 0xffee, 1244 0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703, 1245 0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b, 1246 0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713, 1247 0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b, 1248 0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf, 1249); 1250 1251// utf8 version of above data 1252global $UTF8_SPECIAL_CHARS2; 1253if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 = 1254 "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~ �'. 1255 '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'. 1256 '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'. 1257 '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿–—―‗‘’‚“”�'. 1258 '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'. 1259 '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'. 1260 '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'. 1261 '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'. 1262 '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'. 
1263 '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'. 1264 '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'. 1265 '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. 1266 '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. 1267 '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. 1268 '➷➸➹➺➻➼➽➾'. 1269 ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. 1270 '�'. 1271 '�ﹼﹽ'. 1272 '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. 1273 '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'. 1274 ''. 1275 ' '; 1276 1277/** 1278 * Romanization lookup table 1279 * 1280 * This lookup tables provides a way to transform strings written in a language 1281 * different from the ones based upon latin letters into plain ASCII. 1282 * 1283 * Please note: this is not a scientific transliteration table. It only works 1284 * oneway from nonlatin to ASCII and it works by simple character replacement 1285 * only. Specialities of each language are not supported. 1286 * 1287 * @author Andreas Gohr <andi@splitbrain.org> 1288 * @author Vitaly Blokhin <vitinfo@vitn.com> 1289 * @link http://www.uconv.com/translit.htm 1290 * @author Bisqwit <bisqwit@iki.fi> 1291 * @link http://kanjidict.stc.cx/hiragana.php?src=2 1292 * @link http://www.translatum.gr/converter/greek-transliteration.htm 1293 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 1294 * @link http://www.btranslations.com/resources/romanization/korean.asp 1295 * @author Arthit Suriyawongkul <arthit@gmail.com> 1296 * @author Denis Scheither <amorphis@uni-bremen.de> 1297 * @author Eivind Morland <eivind.morland@gmail.com> 1298 */ 1299global $UTF8_ROMANIZATION; 1300if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array( 1301 // scandinavian - differs from what we do in deaccent 1302 'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O', 1303 1304 //russian cyrillic 1305 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 1306 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 1307 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 1308 
'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 1309 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 1310 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1311 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1312 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 1313 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 1314 // Ukrainian cyrillic 1315 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 1316 // Georgian 1317 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 1318 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 1319 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 1320 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 1321 'ჰ'=>'xh', 1322 //Sanskrit 1323 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 1324 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 1325 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 1326 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 1327 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 1328 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 1329 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 1330 //Sanskrit diacritics 1331 'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T', 1332 'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L', 1333 'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t', 1334 'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l', 1335 //Hebrew 1336 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 1337 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 1338 
'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 1339 'ש'=>'sh','ת'=>'t', 1340 //Arabic 1341 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 1342 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 1343 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 1344 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 1345 1346 // Japanese characters (last update: 2008-05-09) 1347 1348 // Japanese hiragana 1349 1350 // 3 character syllables, っ doubles the consonant after 1351 'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu', 1352 'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu', 1353 'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu', 1354 'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu', 1355 // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu', 1356 'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu', 1357 'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu', 1358 'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu', 1359 'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu', 1360 'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu', 1361 'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu', 1362 1363 // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway) 1364 'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u', 1365 'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu', 1366 1367 // 2 character syllables - normal 1368 'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1369 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu', 1370 'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu', 1371 'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1372 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu', 1373 
'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu', 1374 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu', 1375 'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1376 'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1377 'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu', 1378 'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu', 1379 'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju', 1380 'うぇ'=>'we','うぃ'=>'wi', 1381 'いぇ'=>'ye', 1382 1383 // 2 character syllables, っ doubles the consonant after 1384 'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu', 1385 'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu', 1386 'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu', 1387 'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu', 1388 'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu', 1389 'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku', 1390 'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu', 1391 'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu', 1392 'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru', 1393 'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu', 1394 'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu', 1395 1396 // 1 character syllabels 1397 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n', 1398 'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu', 1399 'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu', 1400 'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu', 1401 'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu', 1402 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du', 1403 'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu', 1404 'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku', 1405 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu', 1406 'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu', 1407 'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru', 1408 'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su', 1409 
'わ'=>'wa','を'=>'wo', 1410 'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu', 1411 'や'=>'ya','よ'=>'yo','ゆ'=>'yu', 1412 // old characters 1413 'ゑ'=>'we','ゐ'=>'wi', 1414 1415 // convert what's left (probably only kicks in when something's missing above) 1416 // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u', 1417 // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu', 1418 1419 // never seen one of those (disabled for the moment) 1420 // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu', 1421 // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu', 1422 // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu', 1423 // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu', 1424 // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu', 1425 // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 1426 // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu', 1427 // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 1428 // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu', 1429 // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu', 1430 // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu', 1431 // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu', 1432 // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu', 1433 // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu', 1434 1435 // 'spare' characters from other romanization systems 1436 // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du', 1437 // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1438 // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su', 1439 // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu', 1440 //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu', 1441 //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu', 1442 //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu', 1443 //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu', 1444 //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi', 1445 
//'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju', 1446 1447 1448 // Japanese katakana 1449 1450 // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs) 1451 'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu', 1452 'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu', 1453 'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu', 1454 'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu', 1455 'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu', 1456 'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu', 1457 'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu', 1458 'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu', 1459 'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu', 1460 'ッティー'=>'ttii', 1461 'ッヂィー'=>'ddii', 1462 1463 // 3 character syllables - doubled vowels 1464 'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo', 1465 'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu', 1466 'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu', 1467 'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu', 1468 'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu', 1469 'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu', 1470 'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu', 1471 'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu', 1472 'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu', 1473 'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu', 1474 'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu', 1475 
'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu', 1476 'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu', 1477 'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu', 1478 'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu', 1479 'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu', 1480 'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu', 1481 'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu', 1482 'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu', 1483 'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu', 1484 'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu', 1485 'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu', 1486 'ウェー'=>'wee','ウィー'=>'wii', 1487 'イェー'=>'yee', 1488 'ティー'=>'tii', 1489 'ヂィー'=>'dii', 1490 1491 // 3 character syllables - doubled consonants 1492 'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu', 1493 'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu', 1494 'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu', 1495 'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu', 1496 'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu', 1497 'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu', 1498 'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu', 1499 'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu', 1500 'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu', 1501 'ッティ'=>'tti', 1502 'ッヂィ'=>'ddi', 1503 1504 // 3 character syllables - doubled vowel and consonants 1505 'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu', 1506 'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu', 1507 'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa', 1508 
'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu', 1509 'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu', 1510 'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu', 1511 'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu', 1512 'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu', 1513 'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu', 1514 'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu', 1515 'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu', 1516 1517 // 2 character syllables - normal 1518 'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu', 1519 // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu', 1520 'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu', 1521 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu', 1522 'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu', 1523 'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1524 'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu', 1525 'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1526 'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu', 1527 'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu', 1528 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu', 1529 'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu', 1530 'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju', 1531 'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu', 1532 'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du', 1533 'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1534 // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu', 1535 'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1536 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu', 1537 'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu', 1538 'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu', 1539 
'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu', 1540 'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu', 1541 'ウェ'=>'we','ウィ'=>'wi', 1542 'イェ'=>'ye', 1543 'ティ'=>'ti', 1544 'ヂィ'=>'di', 1545 1546 // 2 character syllables - doubled vocal 1547 'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu', 1548 'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu', 1549 'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu', 1550 'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu', 1551 'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu', 1552 'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa', 1553 'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu', 1554 'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu', 1555 'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu', 1556 'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu', 1557 'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu', 1558 'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu', 1559 'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu', 1560 'ワー'=>'waa','ヲー'=>'woo', 1561 'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu', 1562 'ヵー'=>'kaa','ヶー'=>'kee', 1563 // old characters 1564 'ヱー'=>'wee','ヰー'=>'wii', 1565 1566 // seperate katakana 'n' 1567 'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u', 1568 'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu', 1569 1570 // 2 character syllables - doubled consonants 1571 'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu', 1572 'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu', 1573 'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka', 1574 'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu', 1575 'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu', 1576 'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu', 1577 'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru', 1578 'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu', 1579 
'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu', 1580 'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu', 1581 'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu', 1582 1583 // 1 character syllables 1584 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n', 1585 'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu', 1586 'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu', 1587 'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1588 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka', 1589 'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu', 1590 'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu', 1591 'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu', 1592 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru', 1593 'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su', 1594 'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu', 1595 'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu', 1596 'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du', 1597 'ワ'=>'wa','ヲ'=>'wo', 1598 'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu', 1599 'ヵ'=>'ka','ヶ'=>'ke', 1600 // old characters 1601 'ヱ'=>'we','ヰ'=>'wi', 1602 1603 // convert what's left (probably only kicks in when something's missing above) 1604 'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u', 1605 'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu', 1606 1607 // special characters 1608 '・'=>'_','、'=>'_', 1609 'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise 1610 1611 // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu', 1612 // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu', 1613 //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu', 1614 // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu', 1615 // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu', 1616 //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu', 1617 //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu', 1618 // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu', 1619 // 
'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu', 1620 //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu', 1621 //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu', 1622 //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi', 1623 1624 // "Greeklish" 1625 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 1626 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 1627 1628 // Thai 1629 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 1630 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 1631 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 1632 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 1633 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 1634 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1635 'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am', 1636 'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u', 1637 'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o', 1638 'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua', 1639 'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao', 1640 'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai', 1641 'ิว'=>'io','็ว'=>'eo','ียว'=>'iao', 1642 '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'', 1643 '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'', 1644 'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-', 1645 '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4', 1646 '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9', 1647 1648 // Korean 1649 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 1650 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 1651 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 1652 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 1653 
'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 1654 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 1655); 1656 1657 1658