<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */


/**
 * Check for mb_string support
 *
 * UTF8_MBSTRING is set to 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise to 0. A value defined earlier is left alone.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}


/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string is returned untouched if it
 * consists only of ASCII letters, digits and the characters / _ - . % -
 * this makes it safe to run it multiple times on the same string
 * (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding for strings that look already safe
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool true when no byte above 0x7F is present
 */
function utf8_isASCII($str){
    // a single byte-wise scan replaces the former per-character loop
    return !preg_match('/[\x80-\xFF]/',(string) $str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string the input without any byte above 0x7F
 */
function utf8_strip($str){
    return (string) preg_replace('/[\x80-\xFF]+/','',(string) $str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the lead byte / continuation byte structure is checked. 5 and 6 byte
 * sequences are accepted and overlong forms are not rejected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue;                 # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1;       # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2;       # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3;       # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4;       # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5;       # 1111110b
        else return false;                       # Does not match any model
        for ($j=0; $j<$n; $j++) {                # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * NOTE(review): utf8_decode() is deprecated in recent PHP releases; it is
 * kept here so counts for malformed input do not change.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 * Note: supports use of negative offsets and lengths but will be slower
 * when doing so
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        // always name the encoding so the result does not depend on
        // mb_internal_encoding()
        if( $length === null ){
            return mb_substr($str, $offset, mb_strlen($str,'utf-8'), 'utf-8');
        }
        return mb_substr($str, $offset, $length, 'utf-8');
    }

    if ( $offset >= 0 && $length >= 0 ) {
        if ( $length === null ) {
            $length = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                $length = '*';
            } else {
                $length = '{'.$length.'}';
            }
        }

        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;

    } else {
        // Handle negatives using different, slower technique
        // From: http://www.php.net/manual/en/function.substr.php#44838
        // The s modifier is required, otherwise newlines would be dropped
        preg_match_all('/./us', $str, $ar);
        if( $length !== null ) {
            return join('',array_slice($ar[0],$offset,$length));
        } else {
            return join('',array_slice($ar[0],$offset));
        }
    }
}


/**
 * Unicode aware replacement for substr_replace()
 *
 * Only non-negative $start values are handled specially; unlike the native
 * function the default $length of 0 inserts without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  character offset
 * @param  int    $length number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for explode
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep delimiter, must not be empty
 * @param  string $str
 * @return mixed array of parts or FALSE on empty delimiter
 */
function utf8_explode($sep, $str) {
    if ( $sep == '' ) {
        trigger_error('Empty delimiter',E_USER_WARNING);
        return false;
    }

    return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * NOTE(review): the replacement is handed to preg_replace() as is, so
 * sequences like $1 or \1 in $r are treated as backreferences.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed $s   search string or array of search strings
 * @param  mixed $r   replacement(s)
 * @param  mixed $str subject(s)
 * @return mixed
 */
function utf8_str_replace($s,$r,$str){
    if(!is_array($s)){
        $s = '!'.preg_quote($s,'!').'!u';
    }else{
        foreach ($s as $k => $v) {
            // quote against the same delimiter as the scalar branch
            $s[$k] = '!'.preg_quote($v,'!').'!u';
        }
    }
    return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip, empty for the ltrim() default
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    //quote charlist for use in a characterclass (^ would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip, empty for the rtrim() default
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    //quote charlist for use in a characterclass (^ would negate the class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip, empty for the trim() default
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    // the charlist has to be handed on, otherwise plain trim() would be used
    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // most code points have no table entry - do not index blindly
        if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i=0; $i < $cnt; $i++){
        // most code points have no table entry - do not index blindly
        if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    if($case <= 0){
        global $UTF8_LOWER_ACCENTS;
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        global $UTF8_UPPER_ACCENTS;
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Romanize a non-latin string
 *
 * Uses the global $UTF8_ROMANIZATION translation table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
    if(utf8_isASCII($string)) return $string; //nothing to do

    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class,
 *                            it is inserted unquoted)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
    global $UTF8_SPECIAL_CHARS;

    // the quoted character class is built once per request
    static $specials = null;
    if(is_null($specials)){
        $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
    }

    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset character offset to start searching at
 * @return mixed character position or FALSE when not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

    if(!$offset){
        $ar = utf8_explode($needle, $haystack);
        if ( count($ar) > 1 ) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }else{
        if ( !is_int($offset) ) {
            trigger_error('Offset must be an integer',E_USER_WARNING);
            return false;
        }

        $haystack = utf8_substr($haystack, $offset);

        if ( false !== ($pos = utf8_strpos($haystack,$needle))){
            return $pos + $offset;
        }
        return false;
    }
}

/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (U+0080 to U+07FF) are converted, all other
 * bytes are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
    $ret  = '';
    $max  = strlen($str);
    $last = 0;  // keeps the index of the last regular character
    for ($i=0; $i<$max; $i++) {
        $c1 = ord($str[$i]);
        // 110x xxxx, 110 prefix for 2 bytes unicode; a lead byte at the very
        // end of the string has no trail byte and is copied as it is
        if ($c1>>5 == 6 && $i+1 < $max) {
            $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
            $c1 &= 31;                             // remove the 3 bit two bytes prefix
            $c2  = ord($str[++$i]) & 63;           // next byte without its 2 bit prefix
            // 5 payload bits of the lead byte followed by 6 of the trail byte
            $ret .= '&#' . (($c1 << 6) | $c2) . ';';
            $last = $i+1;
        }
    }
    return $ret . substr($str, $last); // append the last batch of regular characters
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}

/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING
 *
 * The string is built by plain concatenation; no output buffer is opened,
 * so an early return in strict mode can not leave a buffer behind.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';
    $out = '';

    foreach ($arr as $k => $cp) {

        # ASCII range (including control chars)
        if ( ($cp >= 0) && ($cp <= 0x007f) ) {

            $out .= chr($cp);

        # 2 byte sequence (the lower bound keeps negative values out)
        } else if ( ($cp > 0x007f) && ($cp <= 0x07ff) ) {

            $out .= chr(0xc0 | ($cp >> 6));
            $out .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ( ($cp > 0x07ff) && ($cp <= 0xffff) ) {

            $out .= chr(0xe0 | ($cp >> 12));
            $out .= chr(0x80 | (($cp >> 6) & 0x003f));
            $out .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ( ($cp > 0xffff) && ($cp <= 0x10ffff) ) {

            $out .= chr(0xf0 | ($cp >> 18));
            $out .= chr(0x80 | (($cp >> 12) & 0x3f));
            $out .= chr(0x80 | (($cp >> 6) & 0x3f));
            $out .= chr(0x80 | ($cp & 0x3f));

        } elseif($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return FALSE;
        }
    }

    return $out;
}

/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Without mb_string the string is converted through utf8_to_unicode();
 * code points above 0xFFFF are written as surrogate pairs.
 *
 * @param  string $str UTF-8 string (taken by reference, not modified)
 * @param  bool   $bom prepend the big endian byte order mark?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

    $uni = utf8_to_unicode($str);
    foreach($uni as $cp){
        if($cp > 0xFFFF){
            // does not fit into one 16 bit unit
            $cp  -= 0x10000;
            $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
        }else{
            $out .= pack('n',$cp);
        }
    }
    return $out;
}

/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Surrogate pairs are combined into a single code point. Unpaired
 * surrogates are handed to unicode_to_utf8() unchanged, which skips them
 * in its non-strict mode.
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
    $units = unpack('n*',$str);
    if(!is_array($units)) return '';

    $uni  = array();
    $high = 0;  // pending high surrogate, 0 if none
    foreach($units as $unit){
        if($high){
            if($unit >= 0xDC00 && $unit <= 0xDFFF){
                $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                $high  = 0;
                continue;
            }
            $uni[] = $high; // high surrogate without its partner
            $high  = 0;
        }
        if($unit >= 0xD800 && $unit <= 0xDBFF){
            $high = $unit;
        }else{
            $uni[] = $unit;
        }
    }
    if($high) $uni[] = $high;

    return unicode_to_utf8($uni);
}

/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $result = '';
    $offset = 0;
    $len    = strlen($str);
    // Every byte is matched by one of the alternatives, so each match starts
    // exactly at $offset. Moving the offset avoids copying the remaining
    // string on each round.
    while ($offset < $len && preg_match('/'.$UTF8_BAD.'/S', $str, $matches, 0, $offset)) {
        if ( !isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $offset += strlen($matches[0]);
    }
    return $result;
}

/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = up (current character)
 *                           true = down (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

    if ($i <= 0) return 0;

    $limit = strlen($str);
    if ($i>=$limit) return $limit;

    // bytes of the form 10xxxxxx are continuation bytes
    if ($next) {
        while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
    } else {
        while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
    }

    return $i;
}

// only needed if no mb_string available
if(!UTF8_MBSTRING){

    /**
     * UTF-8 Case lookup table
     *
     * This lookuptable maps the lower case letters to their corresponding
     * upper case letter (as Unicode code points)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_LOWER_TO_UPPER;
    $UTF8_LOWER_TO_UPPER = array(
        0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
        0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
        0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
        0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
        0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
        0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
        0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
        0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
        0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
        0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
        0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
        0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
        0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
        0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
        0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
        0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
        0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
        0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
        0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
        0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
        0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
        0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
        0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
        0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
        0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
        0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
        0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
        0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
        0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
        0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
        0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
        0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
        0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
        0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
        0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
        0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
        0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
        0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
        0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
        0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
        0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
        0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
        0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
    );

    /**
     * UTF-8 Case lookup table
     *
     * This lookuptable maps the upper case letters to their corresponding
     * lower case letter (it does so by flipping $UTF8_LOWER_TO_UPPER)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    global $UTF8_UPPER_TO_LOWER;
    $UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables


/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable defines replacements from the ASCII-7 range for
 * accented characters. These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines replacements from the ASCII-7 range for
 * accented characters. These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, .
0x2e, - 0x2d, * 0x2a 932 * 933 * @author Andreas Gohr <andi@splitbrain.org> 934 * @see utf8_stripspecials() 935 */ 936global $UTF8_SPECIAL_CHARS; 937$UTF8_SPECIAL_CHARS = array( 938 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 939 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 940 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 941 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 942 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 943 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 944 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 945 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 946 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 947 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 948 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 949 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 950 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 951 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 952 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 953 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 954 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 955 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 956 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 957 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 958 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 959 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 960 0x2215, 0x2217, 
0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 961 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 962 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 963 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 964 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 965 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 966 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 967 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 968 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 969 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 970 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 971 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 972 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 973 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 974 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 975 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 976 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 977 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 978 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 979 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 980 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 981 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 982 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 983 0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 984 
0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 985 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 986 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 987 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 988); 989 990/** 991 * Romanization lookup table 992 * 993 * This lookup tables provides a way to transform strings written in a language 994 * different from the ones based upon latin letters into plain ASCII. 995 * 996 * Please note: this is not a scientific transliteration table. It only works 997 * oneway from nonlatin to ASCII and it works by simple character replacement 998 * only. Specialities of each language are not supported. 999 * 1000 * @author Andreas Gohr <andi@splitbrain.org> 1001 * @author Vitaly Blokhin <vitinfo@vitn.com> 1002 * @link http://www.uconv.com/translit.htm 1003 * @author Bisqwit <bisqwit@iki.fi> 1004 * @link http://kanjidict.stc.cx/hiragana.php?src=2 1005 * @link http://www.translatum.gr/converter/greek-transliteration.htm 1006 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 1007 * @link http://www.btranslations.com/resources/romanization/korean.asp 1008 */ 1009global $UTF8_ROMANIZATION; 1010$UTF8_ROMANIZATION = array( 1011 //russian cyrillic 1012 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 1013 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 1014 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 1015 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 1016 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 1017 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1018 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1019 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 1020 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 1021 // 
Ukrainian cyrillic 1022 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 1023 // Georgian 1024 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 1025 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 1026 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 1027 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 1028 'ჰ'=>'xh', 1029 //Sanskrit 1030 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 1031 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 1032 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 1033 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 1034 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 1035 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 1036 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 1037 //Hebrew 1038 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 1039 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 1040 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 1041 'ש'=>'sh','ת'=>'t', 1042 //Arabic 1043 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 1044 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 1045 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 1046 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 1047 1048 // Japanese hiragana 1049 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be', 1050 'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di', 1051 'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1052 'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha', 1053 'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je', 1054 
'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki', 1055 'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1056 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne', 1057 'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po', 1058 'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa', 1059 'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti', 1060 'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo', 1061 'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye', 1062 'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo', 1063 'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1064 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya', 1065 'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe', 1066 'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi', 1067 'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo', 1068 'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo', 1069 'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 1070 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya', 1071 'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye', 1072 'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi', 1073 'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo', 1074 'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1075 'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1076 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya', 1077 'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she', 1078 'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi', 1079 'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo', 1080 
'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 1081 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa', 1082 'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye', 1083 'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi', 1084 'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who', 1085 'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi', 1086 'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo', 1087 'じゅ'=>'zyu', 1088 // Japanese katakana 1089 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi', 1090 'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do', 1091 'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga', 1092 'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho', 1093 'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka', 1094 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo', 1095 'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne', 1096 'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1097 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si', 1098 'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va', 1099 'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi', 1100 'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze', 1101 'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo', 1102 'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1103 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha', 1104 'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe', 1105 'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi', 1106 
'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi', 1107 'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo', 1108 'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1109 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya', 1110 'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye', 1111 'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi', 1112 'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo', 1113 'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo', 1114 'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1115 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha', 1116 'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe', 1117 'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi', 1118 'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho', 1119 'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1120 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya', 1121 'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye', 1122 'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi', 1123 'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe', 1124 'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi', 1125 'ジョ'=>'zyo','ジュ'=>'zyu', 1126 1127 // "Greeklish" 1128 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 1129 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 1130 1131 // Thai 1132 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 1133 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 1134 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 1135 
'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 1136 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 1137 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1138 'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i', 1139 '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae', 1140 'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe', 1141 'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua', 1142 '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai', 1143 'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai', 1144 'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo', 1145 'เ–ียว'=>'iao', 1146 1147 // Korean 1148 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 1149 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 1150 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 1151 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 1152 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 1153 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 1154); 1155 1156//Setup VIM: ex: et ts=2 enc=utf-8 : 1157 1158