<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * check for mb_string support
 *
 * UTF8_MBSTRING becomes 1 only when mb_substr() exists and UTF8_NOMBSTRING
 * has not been defined; an already defined UTF8_MBSTRING is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }


/**
 * URL-Encode a filename to allow unicodecharacters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding when only URL-safe characters are present
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    // The D modifier anchors "$" at the very end of the subject; without it a
    // name ending in "\n" passed the check and was returned unencoded.
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
        return $file;
    }
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool true when no byte above 0x7F is present (also for '')
 */
function utf8_isASCII($str){
    // byte-wise match (no /u flag); replaces the $str{$i} loop, whose
    // curly-brace offset syntax is no longer accepted by PHP 8
    return !preg_match('/[\x80-\xFF]/', (string)$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string $str without any byte above 0x7F
 */
function utf8_strip($str){
    // byte-wise removal, same result as copying every byte < 128
    return preg_replace('/[\x80-\xFF]+/', '', (string)$str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * @author <bmorel@ssi.fr>
 * @link
http://www.php.net/manual/en/function.utf8-encode.php
 *
 * Only the byte structure is inspected: every lead byte must be followed by
 * the matching number of 10bbbbbb continuation bytes. Lead bytes announcing
 * 5 and 6 byte sequences are accepted; overlong forms, surrogates and the
 * code point range are not checked here.
 *
 * @param  string $Str
 * @return bool
 */
function utf8_check($Str) {
    $len = strlen($Str); // loop invariant, was recomputed on every pass
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue; # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
        else return false; # Does not match any model
        for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; behaviour is left
 * unchanged here, plan a replacement before it is removed.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or false if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
        $strlen = strlen(utf8_decode($str));        // see notes
        $offset = $strlen + $offset;
        if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
        $Ox = (int)($offset/65535);
        $Oy = $offset%65535;

        if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
        $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
        $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
        $length_pattern = '(.*)$';                  // the rest of the string
    } else {

        if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
        if ($offset > $strlen) return '';           // another trivial case

        if ($length > 0) {

            $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

            $Lx = (int)($length/65535);
            $Ly = $length%65535;

            // +ve length requires ... a captured group of length characters
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

        } else if ($length < 0) {

            if ($length < ($offset - $strlen)) return '';

            $Lx = (int)((-$length)/65535);
            $Ly = (-$length)%65535;

            // -ve length requires ... capture everything except a group of -length characters
            //                         anchored at the tail-end of the string
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
        }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * Unlike substr_replace() the default $length is 0, i.e. the replacement is
 * inserted at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  character offset; negative counts from the end
 * @param  int    $length characters to replace; negative means "stop that
 *                        many characters before the end of the string"
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $start  = (int)$start;
    $length = (int)$length;

    // Negative values used to be passed straight through, which dropped the
    // head of the string (negative start) or cut at the wrong place (negative
    // length). Translate them into plain non-negative values first; the
    // character count is only computed when it is really needed.
    if($start < 0 || $length < 0){
        $total = utf8_strlen($string);
        if($start < 0)  $start  = max(0, $total + $start);
        if($length < 0) $length = max(0, $total + $length - $start);
    }

    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter, must not be empty
 * @param  string $str
 * @return array|false false (plus an E_USER_WARNING) for an empty delimiter
 */
function utf8_explode($sep, $str) {
    // strict test on the string form: a delimiter such as 0 or '0' is valid
    if ( (string)$sep === '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return false;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   search string(s), taken literally
 * @param  string|array $r   replacement(s), taken literally
 * @param  string|array $str subject
 * @return string|array
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }

    // str_replace() inserts the replacement verbatim, preg_replace() would
    // expand "$1", "${1}" or "\1" in it - escape backslash and dollar so the
    // replacement text comes out unchanged
    $literal = array('\\' => '\\\\', '$' => '\\$');
    if(is_array($r)){
        foreach ($r as $k => $v) {
            $r[$k] = strtr((string)$v, $literal);
        }
    }else{
        $r = strtr((string)$r, $literal);
    }

    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
* @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip, each taken literally (no
 *                          "a..z" ranges); empty means native ltrim()
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass ("^" included, otherwise a
    //charlist starting with it would negate the whole class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip, each taken literally (no
 *                          "a..z" ranges); empty means native rtrim()
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass ("^" included, see utf8_ltrim)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip from both ends; empty means
 *                          native trim()
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // the charlist has to be handed on - without it both helpers fell back
    // to a plain whitespace trim and the requested characters stayed
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the global lookup table
 * $UTF8_UPPER_TO_LOWER.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the global lookup table
 * $UTF8_LOWER_TO_UPPER.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
}

/**
 * Replace accented UTF-8 characters by
* unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * The replacements come from the globals $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS, which are not defined in this function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

    // lower-case table first, upper-case table second
    if($case <= 0) $string = strtr($string,$UTF8_LOWER_ACCENTS);
    if($case >= 0) $string = strtr($string,$UTF8_UPPER_ACCENTS);

    return $string;
}

/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is handed back untouched, anything else is translated
 * with the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    global $UTF8_ROMANIZATION;

    if(!utf8_isASCII($string)){
        $string = strtr($string,$UTF8_ROMANIZATION);
    }
    return $string;
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * The quoted form of $UTF8_SPECIAL_CHARS2 is computed on the first call and
 * reused afterwards; $UTF8_SPECIAL_CHARS is declared but not read here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS;
    global $UTF8_SPECIAL_CHARS2;

    static $specials = null;
    if($specials === null){
        $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
    }

    // $additional goes into the character class as-is (caller quotes it)
    $class = $additional.'\x00-\x19'.$specials;
    return preg_replace('/['.$class.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * The search itself is byte based (strpos); the byte position found is then
 * converted into a character position, and the search is repeated further
 * right as long as that position lies before the requested character offset.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset   character offset to start searching from
 * @return integer|false     character position or false if not found
 */
function utf8_strpos($haystack, $needle, $offset=0){
    $shift = 0; // bytes to add to $offset to skip past a too-early hit

    do {
        $bytePos = strpos($haystack, $needle, $offset + $shift);
        if ($bytePos === false) {
            return false;
        }

        $charPos = utf8_strlen(substr($haystack, 0, $bytePos));
        if ($charPos < $offset) {
            $shift = $bytePos - $charPos;
        }
    } while ($charPos < $offset);

    return $charPos;
}


/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Code points below 0x80 are copied as characters, 0x80-0xFF become decimal
 * entities and everything above becomes a hexadecimal entity.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
    $parts = array();
    foreach (utf8_to_unicode($str) as $cp) {
        if ($cp >= 0x100) {
            $parts[] = '&#x'.dechex($cp).';';
        } elseif ($cp >= 0x80) {
            $parts[] = "&#$cp;";
        } else {
            $parts[] = chr($cp);
        }
    }
    return implode('', $parts);
}

/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 * are handled as well. Avoids the problem that would occur if you
 * had to decode "&#38;&amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&&"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&#38;"
 * what it should be                   -> "&&#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
*/
function utf8_unhtml($str, $entities=null) {
    static $decoder = null;

    // numeric entities only
    if (is_null($entities)) {
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }

    // named entities requested: the lookup table is built once, on demand
    if (is_null($decoder)) {
        $decoder = new utf8_entity_decoder();
    }
    return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                 array($decoder, 'decode'), $str);
}

/**
 * Callback for utf8_unhtml(): decode one numeric entity
 *
 * @param  array $ent regex match: [0] whole entity, [2] 'x'/'X' for a
 *                    hexadecimal entity (empty otherwise), [3] the digits
 * @return string UTF-8 character, or the entity text unchanged when the
 *                digits are not a valid decimal/hexadecimal number
 */
function utf8_decode_numeric($ent) {
    $digits = $ent[3];

    if ($ent[2] === 'x' || $ent[2] === 'X') {
        // the pattern lets any alphanumeric through; "&#xZZ;" is no entity
        if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) return $ent[0];
        $cp = hexdec($digits);
    } else {
        // "&#foo;" used to become intval('foo') == 0, i.e. a NUL byte
        if (!preg_match('/^[0-9]+$/D', $digits)) return $ent[0];
        $cp = intval($digits);
    }
    return unicode_to_utf8(array($cp));
}

/**
 * Lookup table based decoder for named HTML entities
 *
 * $table maps the entity text (e.g. "&lt;") to its UTF-8 replacement.
 */
class utf8_entity_decoder {
    var $table;

    /**
     * Build the entity => UTF-8 character table
     *
     * A same-named method is no longer run as constructor by PHP 8, which
     * left $table empty - hence the explicit __construct().
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES);
        $table = array_flip($table);
        $this->table = array_map(array($this,'makeutf8'), $table);
    }

    /**
     * Old style constructor name, kept for code that calls it explicitly
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert one translation table value to UTF-8
     *
     * @param  string $c character as found in the translation table
     * @return string
     */
    function makeutf8($c) {
        // A multi-byte value means the table is already UTF-8 encoded;
        // ord() would only look at its first byte and produce a wrong char.
        if (strlen($c) !== 1) return $c;

        $cp = ord($c);
        if ($cp < 0x80) return $c; // ASCII is identical in UTF-8

        // single high byte: treated as an ISO-8859-1 code point, as before
        return unicode_to_utf8(array($cp));
    }

    /**
     * Callback for utf8_unhtml(): decode one numeric or named entity
     *
     * @param  array $ent regex match: [0] whole entity, [1] '#' for numeric
     * @return string replacement, or the entity text if it is unknown
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        } elseif (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        } else {
            return $ent[0];
        }
    }
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
* @return mixed   array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // square brackets: the $str{$i} form is rejected by PHP 8
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;

            }
            // non-strict: the illegal octet is silently skipped

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }
                        // non-strict: the value is still appended below

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
            // non-strict: the unexpected octet is skipped, the pending
            // sequence keeps waiting for its continuation
        }
    }

    // A sequence cut off by the end of the string was never reported: the
    // loop simply ran out of input. In strict mode that is invalid UTF-8 too.
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                'sequence in UTF-8 at end of string',
                E_USER_WARNING
            );
        return false;
    }

    return $out;
}

/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to use
 * output buffering to concatenate the UTF-8 string (faster) as well as
 * reference the array by it's keys
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
* @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';
    ob_start();

    foreach ($arr as $k => $cp) {

        # outside the Unicode range; negative values are caught here as
        # well - they used to fall into the 2 byte branch and emit garbage
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                // drop the buffer first, otherwise it stays open and the
                // partial output ends up in the page
                ob_end_clean();
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }
            // non-strict: skipped

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            echo chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            echo chr(0xc0 | ($cp >> 6));
            echo chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                ob_end_clean(); // see above
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                    'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }
            // non-strict: skipped

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            echo chr(0xe0 | ($cp >> 12));
            echo chr(0x80 | (($cp >> 6) & 0x003f));
            echo chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence (upper bound checked at the top)
        } else {

            echo chr(0xf0 | ($cp >> 18));
            echo chr(0x80 | (($cp >> 12) & 0x3f));
            echo chr(0x80 | (($cp >> 6) & 0x3f));
            echo chr(0x80 | ($cp & 0x3f));
        }
    }

    $result = ob_get_contents();
    ob_end_clean();
    return $result;
}

/**
 * UTF-8 to UTF-16BE conversion.
787 * 788 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 789 */ 790function utf8_to_utf16be(&$str, $bom = false) { 791 $out = $bom ? "\xFE\xFF" : ''; 792 if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8'); 793 794 $uni = utf8_to_unicode($str); 795 foreach($uni as $cp){ 796 $out .= pack('n',$cp); 797 } 798 return $out; 799} 800 801/** 802 * UTF-8 to UTF-16BE conversion. 803 * 804 * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits 805 */ 806function utf16be_to_utf8(&$str) { 807 $uni = unpack('n*',$str); 808 return unicode_to_utf8($uni); 809} 810 811/** 812 * Replace bad bytes with an alternative character 813 * 814 * ASCII character is recommended for replacement char 815 * 816 * PCRE Pattern to locate bad bytes in a UTF-8 string 817 * Comes from W3 FAQ: Multilingual Forms 818 * Note: modified to include full ASCII range including control chars 819 * 820 * @author Harry Fuecks <hfuecks@gmail.com> 821 * @see http://www.w3.org/International/questions/qa-forms-utf-8 822 * @param string to search 823 * @param string to replace bad bytes with (defaults to '?') - use ASCII 824 * @return string 825 */ 826function utf8_bad_replace($str, $replace = '') { 827 $UTF8_BAD = 828 '([\x00-\x7F]'. # ASCII (including control chars) 829 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte 830 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs 831 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte 832 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates 833 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3 834 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15 835 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. 
# plane 16 836 '|(.{1}))'; # invalid byte 837 ob_start(); 838 while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) { 839 if ( !isset($matches[2])) { 840 echo $matches[0]; 841 } else { 842 echo $replace; 843 } 844 $str = substr($str,strlen($matches[0])); 845 } 846 $result = ob_get_contents(); 847 ob_end_clean(); 848 return $result; 849} 850 851/** 852 * adjust a byte index into a utf8 string to a utf8 character boundary 853 * 854 * @param $str string utf8 character string 855 * @param $i int byte index into $str 856 * @param $next bool direction to search for boundary, 857 * false = up (current character) 858 * true = down (next character) 859 * 860 * @return int byte index into $str now pointing to a utf8 character boundary 861 * 862 * @author chris smith <chris@jalakai.co.uk> 863 */ 864function utf8_correctIdx(&$str,$i,$next=false) { 865 866 if ($i <= 0) return 0; 867 868 $limit = strlen($str); 869 if ($i>=$limit) return $limit; 870 871 if ($next) { 872 while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++; 873 } else { 874 while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--; 875 } 876 877 return $i; 878} 879 880// only needed if no mb_string available 881if(!UTF8_MBSTRING){ 882 /** 883 * UTF-8 Case lookup table 884 * 885 * This lookuptable defines the upper case letters to their correspponding 886 * lower case letter in UTF-8 887 * 888 * @author Andreas Gohr <andi@splitbrain.org> 889 */ 890 global $UTF8_LOWER_TO_UPPER; 891 $UTF8_LOWER_TO_UPPER = array( 892 "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q", 893 "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G", 894 "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ", 895 "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ", 896 "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ", 897 
"ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ", 898 "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ", 899 "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ", 900 "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ", 901 "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ", 902 "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ", 903 "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ", 904 "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ", 905 "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố", 906 "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ", 907 "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ", 908 "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ", 909 "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ", 910 "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ", 911 "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ", 912 "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ", 913 "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ", 914 "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ", 915 "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ", 916 "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս", 917 "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ", 918 
"ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ", 919 "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ", 920 "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ", 921 "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ", 922 "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ", 923 "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ", 924 "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ", 925 "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ", 926 "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ", 927 "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ", 928 "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ", 929 "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ", 930 "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П", 931 "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е", 932 "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ", 933 "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ", 934 "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ", 935 "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π", 936 "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ", 937 "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ", 938 "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ", 939 
"ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ", 940 "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș", 941 "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ", 942 "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ", 943 "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ", 944 "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ", 945 "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ", 946 "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž", 947 "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ", 948 "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ", 949 "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ", 950 "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī", 951 "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė", 952 "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă", 953 "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö", 954 "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì", 955 "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â", 956 "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T", 957 "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J", 958 "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A" 959 ); 960 961 /** 962 * UTF-8 Case lookup table 963 * 964 * This lookuptable defines the lower case 
letters to their correspponding 965 * upper case letter in UTF-8 966 * 967 * @author Andreas Gohr <andi@splitbrain.org> 968 */ 969 global $UTF8_UPPER_TO_LOWER; 970 $UTF8_UPPER_TO_LOWER = array ( 971 "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q", 972 "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g", 973 "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ", 974 "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ", 975 "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ", 976 "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ", 977 "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ", 978 "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ", 979 "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ", 980 "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ", 981 "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ", 982 "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ", 983 "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ", 984 "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố", 985 "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ", 986 "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ", 987 "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ", 988 "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ", 989 "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ", 
990 "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ", 991 "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ", 992 "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ", 993 "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ", 994 "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ", 995 "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս", 996 "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ", 997 "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ", 998 "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ", 999 "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ", 1000 "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ", 1001 "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ", 1002 "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ", 1003 "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ", 1004 "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ", 1005 "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ", 1006 "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ", 1007 "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ", 1008 "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ", 1009 "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п", 1010 
"О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е", 1011 "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ", 1012 "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ", 1013 "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ", 1014 "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π", 1015 "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ", 1016 "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ", 1017 "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ", 1018 "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ", 1019 "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș", 1020 "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ", 1021 "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ", 1022 "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ", 1023 "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ", 1024 "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ", 1025 "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž", 1026 "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ", 1027 "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ", 1028 "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ", 1029 "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī", 1030 
"Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė", 1031 "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă", 1032 "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö", 1033 "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì", 1034 "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â", 1035 "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t", 1036 "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j", 1037 "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a" 1038 ); 1039}; // end of case lookup tables 1040 1041/** 1042 * UTF-8 lookup table for lower case accented letters 1043 * 1044 * This lookuptable defines replacements for accented characters from the ASCII-7 1045 * range. This are lower case letters only. 
*
 * Every key is a single non-ASCII character and every value is its plain
 * ASCII stand-in. A stand-in may be two letters long ('ß' => 'ss',
 * 'ö' => 'oe', 'þ' => 'th'), so a converted string can grow.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f',  'ë'=>'e',  'š'=>'s',  'ơ'=>'o',  'ß'=>'ss',
  'ă'=>'a',  'ř'=>'r',  'ț'=>'t',  'ň'=>'n',  'ā'=>'a',  'ķ'=>'k',  'ŝ'=>'s',  'ỳ'=>'y',
  'ņ'=>'n',  'ĺ'=>'l',  'ħ'=>'h',  'ṗ'=>'p',  'ó'=>'o',  'ú'=>'u',  'ě'=>'e',  'é'=>'e',
  'ç'=>'c',  'ẁ'=>'w',  'ċ'=>'c',  'õ'=>'o',  'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t',
  'ș'=>'s',  'ė'=>'e',  'ĉ'=>'c',  'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c',  'ę'=>'e',
  'ŵ'=>'w',  'ṫ'=>'t',  'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e',  'ŷ'=>'y',  'ą'=>'a',
  'ł'=>'l',  'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g',  'ļ'=>'l',  'ƒ'=>'f',  'ž'=>'z',
  'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i',  'ï'=>'i',  'ḋ'=>'d',  'ť'=>'t',  'ŗ'=>'r',
  'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r',  'ê'=>'e',  'ü'=>'ue', 'ò'=>'o',  'ē'=>'e',  'ñ'=>'n',
  'ń'=>'n',  'ĥ'=>'h',  'ĝ'=>'g',  'đ'=>'d',  'ĵ'=>'j',  'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',
  'ư'=>'u',  'ţ'=>'t',  'ý'=>'y',  'ő'=>'o',  'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z',
  'ī'=>'i',  'ã'=>'a',  'ġ'=>'g',  'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u',  'į'=>'i',
  'ź'=>'z',  'á'=>'a',  'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',  'ĕ'=>'e',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table defines replacements for accented characters from the ASCII-7
 * range. These are upper case letters only.
*
 * Every key is a single non-ASCII character and every value is its plain
 * ASCII stand-in. Two-letter stand-ins are written in title case
 * ('Ö' => 'Oe', 'Þ' => 'Th'), so a converted string can grow.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À'=>'A',  'Ô'=>'O',  'Ď'=>'D',  'Ḟ'=>'F',  'Ë'=>'E',  'Š'=>'S',  'Ơ'=>'O',  'Ă'=>'A',
  'Ř'=>'R',  'Ț'=>'T',  'Ň'=>'N',  'Ā'=>'A',  'Ķ'=>'K',  'Ŝ'=>'S',  'Ỳ'=>'Y',  'Ņ'=>'N',
  'Ĺ'=>'L',  'Ħ'=>'H',  'Ṗ'=>'P',  'Ó'=>'O',  'Ú'=>'U',  'Ě'=>'E',  'É'=>'E',  'Ç'=>'C',
  'Ẁ'=>'W',  'Ċ'=>'C',  'Õ'=>'O',  'Ṡ'=>'S',  'Ø'=>'O',  'Ģ'=>'G',  'Ŧ'=>'T',  'Ș'=>'S',
  'Ė'=>'E',  'Ĉ'=>'C',  'Ś'=>'S',  'Î'=>'I',  'Ű'=>'U',  'Ć'=>'C',  'Ę'=>'E',  'Ŵ'=>'W',
  'Ṫ'=>'T',  'Ū'=>'U',  'Č'=>'C',  'Ö'=>'Oe', 'È'=>'E',  'Ŷ'=>'Y',  'Ą'=>'A',  'Ł'=>'L',
  'Ų'=>'U',  'Ů'=>'U',  'Ş'=>'S',  'Ğ'=>'G',  'Ļ'=>'L',  'Ƒ'=>'F',  'Ž'=>'Z',  'Ẃ'=>'W',
  'Ḃ'=>'B',  'Å'=>'A',  'Ì'=>'I',  'Ï'=>'I',  'Ḋ'=>'D',  'Ť'=>'T',  'Ŗ'=>'R',  'Ä'=>'Ae',
  'Í'=>'I',  'Ŕ'=>'R',  'Ê'=>'E',  'Ü'=>'Ue', 'Ò'=>'O',  'Ē'=>'E',  'Ñ'=>'N',  'Ń'=>'N',
  'Ĥ'=>'H',  'Ĝ'=>'G',  'Đ'=>'D',  'Ĵ'=>'J',  'Ÿ'=>'Y',  'Ũ'=>'U',  'Ŭ'=>'U',  'Ư'=>'U',
  'Ţ'=>'T',  'Ý'=>'Y',  'Ő'=>'O',  'Â'=>'A',  'Ľ'=>'L',  'Ẅ'=>'W',  'Ż'=>'Z',  'Ī'=>'I',
  'Ã'=>'A',  'Ġ'=>'G',  'Ṁ'=>'M',  'Ō'=>'O',  'Ĩ'=>'I',  'Ù'=>'U',  'Į'=>'I',  'Ź'=>'Z',
  'Á'=>'A',  'Û'=>'U',  'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it is not a complete list of
 * non-alphanumeric characters in UTF-8. It is not perfect but should match
 * most cases of special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, .
0x2e, - 0x2d, * 0x2a 1107 * 1108 * @author Andreas Gohr <andi@splitbrain.org> 1109 * @see utf8_stripspecials() 1110 */ 1111global $UTF8_SPECIAL_CHARS; 1112$UTF8_SPECIAL_CHARS = array( 1113 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 1114 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 1115 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 1116 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 1117 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 1118 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 1119 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 1120 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 1121 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 1122 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 1123 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 1124 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 1125 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 1126 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 1127 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 1128 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 1129 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 1130 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 1131 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 1132 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 1133 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 1134 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 
0x2212, 1135 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 1136 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 1137 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 1138 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 1139 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 1140 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 1141 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 1142 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 1143 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 1144 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 1145 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 1146 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 1147 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 1148 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 1149 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 1150 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 1151 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 1152 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 1153 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 1154 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 1155 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 1156 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 1157 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 1158 0x27be, 0x3000, 0x3001, 0x3002, 
0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 1159 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, 1160 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 1161 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 1162 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 1163 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 1164 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 1165 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 1166 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 1167 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 1168 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 1169 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 1170 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 1171 0xffeb, 0xffec, 0xffed, 0xffee, 1172); 1173 1174// utf8 version of above data 1175global $UTF8_SPECIAL_CHARS2; 1176$UTF8_SPECIAL_CHARS2 = 1177 "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~ �'. 1178 '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'. 1179 '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'. 1180 '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿–—―‗‘’‚“”�'. 1181 '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'. 1182 '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'. 1183 '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'. 1184 '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'. 1185 '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'. 1186 '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'. 1187 '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'. 1188 '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. 1189 '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. 1190 '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. 1191 '➷➸➹➺➻➼➽➾'. 1192 ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. 1193 '�'. 1194 '�ﹼﹽ'. 1195 '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. 
1196 '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'; 1197 1198/** 1199 * Romanization lookup table 1200 * 1201 * This lookup tables provides a way to transform strings written in a language 1202 * different from the ones based upon latin letters into plain ASCII. 1203 * 1204 * Please note: this is not a scientific transliteration table. It only works 1205 * oneway from nonlatin to ASCII and it works by simple character replacement 1206 * only. Specialities of each language are not supported. 1207 * 1208 * @author Andreas Gohr <andi@splitbrain.org> 1209 * @author Vitaly Blokhin <vitinfo@vitn.com> 1210 * @link http://www.uconv.com/translit.htm 1211 * @author Bisqwit <bisqwit@iki.fi> 1212 * @link http://kanjidict.stc.cx/hiragana.php?src=2 1213 * @link http://www.translatum.gr/converter/greek-transliteration.htm 1214 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 1215 * @link http://www.btranslations.com/resources/romanization/korean.asp 1216 */ 1217global $UTF8_ROMANIZATION; 1218$UTF8_ROMANIZATION = array( 1219 //russian cyrillic 1220 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 1221 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 1222 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 1223 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 1224 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 1225 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1226 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1227 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 1228 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 1229 // Ukrainian cyrillic 1230 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 1231 // Georgian 1232 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 1233 
'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 1234 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 1235 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 1236 'ჰ'=>'xh', 1237 //Sanskrit 1238 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 1239 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 1240 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 1241 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 1242 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 1243 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 1244 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 1245 //Hebrew 1246 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 1247 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 1248 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 1249 'ש'=>'sh','ת'=>'t', 1250 //Arabic 1251 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 1252 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 1253 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 1254 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 1255 1256 // Japanese hiragana 1257 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 1258 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 1259 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1260 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 1261 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 1262 'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 1263 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1264 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 1265 
'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 1266 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 1267 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 1268 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 1269 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 1270 'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 1271 'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1272 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 1273 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 1274 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 1275 'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 1276 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 1277 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 1278 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 1279 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 1280 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 1281 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 1282 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1283 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1284 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 1285 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 1286 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 1287 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 1288 'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 1289 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 1290 'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 
1291 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 1292 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 1293 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 1294 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 1295 'じゅ'=>'zyu', 1296 // Japanese katakana 1297 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 1298 'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 1299 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 1300 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 1301 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 1302 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 1303 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 1304 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1305 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 1306 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 1307 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 1308 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 1309 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 1310 'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1311 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 1312 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 1313 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 1314 'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 1315 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 1316 'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1317 
'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 1318 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 1319 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 1320 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 1321 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 1322 'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1323 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 1324 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 1325 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 1326 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 1327 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1328 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 1329 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 1330 'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 1331 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 1332 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 1333 'ジョ'=>'zyo','ジュ'=>'zyu', 1334 1335 // "Greeklish" 1336 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 1337 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 1338 1339 // Thai 1340 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 1341 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 1342 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 1343 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 1344 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 1345 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1346 
'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 1347 '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 1348 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 1349 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 1350 '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 1351 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 1352 'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 1353 'เ–ียว'=>'iao', 1354 1355 // Korean 1356 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 1357 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 1358 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 1359 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 1360 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 1361 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 1362); 1363 1364//Setup VIM: ex: et ts=2 enc=utf-8 : 1365 1366