xref: /dokuwiki/inc/utf8.php (revision 40b33eff69a8a44da0ab6962d50f2027bcfc33b6)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9
/**
 * Decide once whether the mbstring extension backs the helpers in this file
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and the including script
 * has not defined UTF8_NOMBSTRING; otherwise it becomes 0. When mbstring is
 * selected its internal encoding is switched to UTF-8. A UTF8_MBSTRING value
 * defined before this point is left untouched.
 */
if (!defined('UTF8_MBSTRING')) {
    if (defined('UTF8_NOMBSTRING') || !function_exists('mb_substr')) {
        define('UTF8_MBSTRING', 0);
    } else {
        define('UTF8_MBSTRING', 1);
        mb_internal_encoding('UTF-8');
    }
}
21
22
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are left unencoded.
 *
 * With $safe enabled (the default) a string made up solely of the
 * characters a-z, A-Z, 0-9, '/', '_', '-', '.' and '%' is returned
 * untouched, which makes it harmless to run the function several times
 * on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe return the input as is when it holds only safe characters
 * @return string
 */
function utf8_encodeFN($file, $safe = true) {
    $alreadySafe = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#', $file);
    if (!$alreadySafe) {
        // encode everything, then put the slashes back
        $file = str_replace('%2F', '/', urlencode($file));
    }
    return $file;
}
44
/**
 * URL-decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file URL-encoded filename
 * @return string
 */
function utf8_decodeFN($file) {
    return urldecode($file);
}
57
/**
 * Checks if a string contains 7bit ASCII only
 *
 * A single pattern match replaces the former byte loop, which relied on
 * the "$str{$i}" offset syntax that PHP 8 no longer parses.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str string to inspect
 * @return bool   true when no byte above 0x7F is present
 */
function utf8_isASCII($str) {
    // no "u" modifier on purpose: the pattern works on raw bytes
    return !preg_match('/[\x80-\xFF]/', $str);
}
69
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 0x7F is removed. The
 * former byte loop used the "$str{$i}" offset syntax that PHP 8 no longer
 * parses; a byte-wise pattern replacement does the same work.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str string to clean
 * @return string the input without any byte >= 0x80
 */
function utf8_strip($str) {
    // no "u" modifier on purpose: the pattern works on raw bytes
    return preg_replace('/[\x80-\xFF]+/', '', $str);
}
86
/**
 * Tries to detect if a string is structurally UTF-8
 *
 * Each byte has to be plain ASCII or a lead byte (the 2 to 6 byte forms
 * are recognised) that is followed by the matching number of 10bbbbbb
 * continuation bytes. The decoded code point values are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str bytes to examine
 * @return bool   true when every sequence is well formed
 */
function utf8_check($Str) {
    $total = strlen($Str);
    $pos   = 0;

    while ($pos < $total) {
        $lead = ord($Str[$pos]);

        if ($lead < 0x80) {                         // 0bbbbbbb
            $pos++;
            continue;
        }

        // number of continuation bytes announced by the lead byte
        if (($lead & 0xE0) == 0xC0) {               // 110bbbbb
            $tail = 1;
        } elseif (($lead & 0xF0) == 0xE0) {         // 1110bbbb
            $tail = 2;
        } elseif (($lead & 0xF8) == 0xF0) {         // 11110bbb
            $tail = 3;
        } elseif (($lead & 0xFC) == 0xF8) {         // 111110bb
            $tail = 4;
        } elseif (($lead & 0xFE) == 0xFC) {         // 1111110b
            $tail = 5;
        } else {
            return false;                           // does not match any model
        }

        // every announced byte must exist and look like 10bbbbbb
        for ($seen = 0; $seen < $tail; $seen++) {
            $pos++;
            if ($pos == $total || (ord($Str[$pos]) & 0xC0) != 0x80) {
                return false;
            }
        }
        $pos++;
    }

    return true;
}
109
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() turns every character outside ISO-8859-1 into a single
 * '?', so the byte length of its result equals the character count of
 * the UTF-8 input.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string) {
    $singleByte = utf8_decode($string);
    return strlen($singleByte);
}
124
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * Uses mb_substr() when UTF8_MBSTRING is set. Otherwise non-negative
 * arguments are served by one anchored regular expression; negative
 * offsets or lengths fall back to a slower technique that splits the
 * string into an array of characters.
 *
 * Fixes over the previous implementation:
 *  - the character split for negative arguments now uses the "s" modifier,
 *    so newlines are no longer dropped from the result
 *  - offsets and lengths above 65535 no longer yield an invalid pattern
 *    (PCRE limits a counted repeat to 65535)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if (UTF8_MBSTRING) {
        if ($length === null) {
            return mb_substr($str, $offset);
        }
        return mb_substr($str, $offset, $length);
    }

    if ($offset >= 0 && $length >= 0) {
        if ($length === null) {
            $take = '.*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ($offset > $strlen) {
                return '';
            }
            if (($offset + $length) > $strlen) {
                // asking for more than is left: take the remainder
                $take = '.*';
            } else {
                $take = _utf8_substr_run($length);
            }
        }

        $pattern = '/^' . _utf8_substr_run($offset) . '(' . $take . ')/us';
        if (preg_match($pattern, $str, $matches)) {
            return $matches[1];
        }
        return false;
    }

    // Handle negatives using different, slower technique
    // From: http://www.php.net/manual/en/function.substr.php#44838
    // The "s" modifier is required so that "." also captures newlines.
    preg_match_all('/./us', $str, $ar);
    if ($length !== null) {
        return join('', array_slice($ar[0], $offset, $length));
    }
    return join('', array_slice($ar[0], $offset));
}

/**
 * Build a pattern fragment matching exactly $count arbitrary characters
 *
 * Private helper of utf8_substr(). Counts above 65535 are expressed as
 * repeated groups of 65535 characters plus a remainder because PCRE does
 * not accept larger numbers inside {}.
 *
 * @param  integer $count number of characters to match (>= 0)
 * @return string  pattern fragment without delimiters
 */
function _utf8_substr_run($count) {
    $count  = (int) $count;
    $chunks = (int) ($count / 65535);
    $run    = ($chunks > 0) ? '(?:.{65535}){' . $chunks . '}' : '';
    return $run . '.{' . ($count % 65535) . '}';
}
182
183
/**
 * Unicode aware replacement for substr_replace()
 *
 * Offsets and lengths are counted in characters. Unlike the native
 * function the default $length is 0, i.e. $replacement is inserted at
 * $start without removing anything.
 *
 * Negative values are now normalised the way substr_replace() treats
 * them: a negative $start counts from the end of the string and a
 * negative $length names the number of characters at the end that are
 * kept. Previously a negative $start silently lost everything in front
 * of the replacement.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded input
 * @param  string  $replacement text to put in place
 * @param  integer $start       character offset where the replacement begins
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start, $length = 0) {
    if ($start < 0 || $length < 0) {
        // the character count is only needed (and paid for) in this case
        $total = utf8_strlen($string);
        if ($start < 0) {
            $start = max(0, $total + $start);
        }
        if ($start > $total) {
            $start = $total;
        }
        if ($length < 0) {
            $length = max(0, $total - $start + $length);
        }
    }

    $ret = '';
    if ($start > 0) {
        $ret .= utf8_substr($string, 0, $start);
    }
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start + $length);
    return $ret;
}
197
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 pattern for preg_split().
 * An empty separator raises an E_USER_WARNING and yields FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator to split on
 * @param  string $str UTF-8 encoded string to split
 * @return mixed  array of pieces, or FALSE for an empty separator
 */
function utf8_explode($sep, $str) {
    if ($sep != '') {
        $pattern = '!' . preg_quote($sep, '!') . '!u';
        return preg_split($pattern, $str);
    }

    trigger_error('Empty delimiter', E_USER_WARNING);
    return FALSE;
}
213
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as UTF-8 patterns through
 * preg_replace(). The replacement is now escaped as well, so "$1" or
 * "\1" inside it are inserted literally like str_replace() would do
 * instead of being expanded as (empty) pattern references. Both the
 * scalar and the array form quote the pattern delimiter consistently.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to insert
 * @param  mixed  $str subject string (or array of strings)
 * @return mixed  the subject with all replacements applied
 */
function utf8_str_replace($s, $r, $str) {
    if (is_array($s)) {
        foreach ($s as $k => $v) {
            $s[$k] = '!' . preg_quote($v, '!') . '!u';
        }
    } else {
        $s = '!' . preg_quote($s, '!') . '!u';
    }

    // neutralise the two characters preg_replace() treats specially in replacements
    if (is_array($r)) {
        foreach ($r as $k => $v) {
            $r[$k] = addcslashes($v, '\\$');
        }
    } else {
        $r = addcslashes($r, '\\$');
    }

    return preg_replace($s, $r, $str);
}
231
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. Otherwise the charlist
 * is turned into a regular expression character class; because '-' is
 * escaped, every character stands for itself and ranges are not available.
 * The caret is now escaped too: a charlist starting with '^' used to
 * negate (or break) the character class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from the left
 * @return string
 */
function utf8_ltrim($str, $charlist = '') {
    if ($charlist == '') return ltrim($str);

    // quote charlist for use in a character class: \ - ] [ / ^
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist);

    return preg_replace('/^[' . $charlist . ']+/u', '', $str);
}
247
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. Otherwise the charlist
 * is turned into a regular expression character class; because '-' is
 * escaped, every character stands for itself and ranges are not available.
 * The caret is now escaped too: a charlist starting with '^' used to
 * negate (or break) the character class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from the right
 * @return string
 */
function utf8_rtrim($str, $charlist = '') {
    if ($charlist == '') return rtrim($str);

    // quote charlist for use in a character class: \ - ] [ / ^
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist);

    return preg_replace('/[' . $charlist . ']+$/u', '', $str);
}
263
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used. With one, the string is
 * passed through utf8_rtrim() and utf8_ltrim(). The charlist is now
 * actually forwarded to both helpers; it used to be dropped, so only
 * whitespace was ever removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function utf8_trim($str, $charlist = '') {
    if ($charlist == '') return trim($str);

    return utf8_ltrim(utf8_rtrim($str, $charlist), $charlist);
}
276
277
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is decoded
 * to code points, every code point found in $UTF8_UPPER_TO_LOWER is
 * swapped for its lower case partner and the result is encoded again.
 * The table lookup is guarded with isset() so that characters without
 * an entry no longer raise an undefined index notice/warning.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string) {
    if (UTF8_MBSTRING) return mb_strtolower($string, 'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i = 0; $i < $cnt; $i++) {
        if (isset($UTF8_UPPER_TO_LOWER[$uni[$i]])) {
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}
300
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is decoded
 * to code points, every code point found in $UTF8_LOWER_TO_UPPER is
 * swapped for its upper case partner and the result is encoded again.
 * The table lookup is guarded with isset() so that characters without
 * an entry no longer raise an undefined index notice/warning.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string) {
    if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    $cnt = count($uni);
    for ($i = 0; $i < $cnt; $i++) {
        if (isset($UTF8_LOWER_TO_UPPER[$uni[$i]])) {
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
        }
    }
    return unicode_to_utf8($uni);
}
323
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * $case selects which lookup tables are applied: -1 uses only
 * $UTF8_LOWER_ACCENTS, 1 uses only $UTF8_UPPER_ACCENTS and the default 0
 * applies both, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string, $case = 0) {
    $doLower = ($case <= 0);
    $doUpper = ($case >= 0);

    if ($doLower) {
        global $UTF8_LOWER_ACCENTS;
        $accented = array_keys($UTF8_LOWER_ACCENTS);
        $plain    = array_values($UTF8_LOWER_ACCENTS);
        $string   = str_replace($accented, $plain, $string);
    }

    if ($doUpper) {
        global $UTF8_UPPER_ACCENTS;
        $accented = array_keys($UTF8_UPPER_ACCENTS);
        $plain    = array_values($UTF8_UPPER_ACCENTS);
        $string   = str_replace($accented, $plain, $string);
    }

    return $string;
}
343
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is handed back unchanged; anything else is translated
 * with strtr() using the $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string) {
    if (!utf8_isASCII($string)) {
        global $UTF8_ROMANIZATION;
        $string = strtr($string, $UTF8_ROMANIZATION);
    }
    return $string;
}
355
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters listed in $UTF8_SPECIAL_CHARS2 are quoted once and cached
 * in a static variable. They are combined with the bytes 0x00 to 0x19 and
 * the caller supplied $additional characters into one character class;
 * every match is replaced by $repl.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class,
 *                            inserted unquoted)
 * @return string
 */
function utf8_stripspecials($string, $repl = '', $additional = '') {
    global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

    static $specials = null;
    if ($specials === null) {
        $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
    }

    $class = '[' . $additional . '\x00-\x19' . $specials . ']';
    return preg_replace('/' . $class . '/u', $repl, $string);
}
379
/**
 * This is a Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Otherwise the haystack is split
 * at the needle and the character length of the first piece is the
 * position. A non-zero offset is handled by cutting the haystack first
 * and adding the offset to the position found in the remainder.
 *
 * The split result is now checked to be an array before it is counted:
 * utf8_explode() yields FALSE for an empty needle (and preg_split() does
 * so for undecodable input), and counting FALSE is a TypeError in PHP 8.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   UTF-8 encoded string to look for
 * @param  integer $offset   character offset to start searching at
 * @return mixed   character position as integer, or false when not found
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
    if (UTF8_MBSTRING) return mb_strpos($haystack, $needle, $offset, 'utf-8');

    if (!$offset) {
        $ar = utf8_explode($needle, $haystack);
        if (is_array($ar) && count($ar) > 1) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }

    if (!is_int($offset)) {
        trigger_error('Offset must be an integer', E_USER_WARNING);
        return false;
    }

    $haystack = utf8_substr($haystack, $offset);

    if (false !== ($pos = utf8_strpos($haystack, $needle))) {
        return $pos + $offset;
    }
    return false;
}
411
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied as they are; every well formed 2, 3 or 4 byte
 * sequence becomes a decimal numeric entity (&#NNN;). Bytes that do not
 * form such a sequence (stray continuation bytes, truncated or otherwise
 * malformed sequences) are passed through unchanged.
 *
 * Fixes over the previous implementation:
 *  - the code point was computed as high*100+low instead of high*256+low,
 *    giving wrong entities for everything above U+00FF
 *  - 3 and 4 byte sequences were not converted at all
 *  - a lead byte at the very end of the string read past the end
 *  - the "$str{$i}" offset syntax is not parsed by PHP 8
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string the string with non-ASCII characters as numeric entities
 */
function utf8_tohtml($str) {
    $ret  = '';
    $max  = strlen($str);
    $last = 0;  // index of the first byte not yet copied to $ret

    for ($i = 0; $i < $max; $i++) {
        $lead = ord($str[$i]);

        // classify the lead byte: payload bits and number of continuation bytes
        if (($lead & 0xE0) == 0xC0) {           // 110x xxxx
            $extra = 1;
            $cp    = $lead & 0x1F;
        } elseif (($lead & 0xF0) == 0xE0) {     // 1110 xxxx
            $extra = 2;
            $cp    = $lead & 0x0F;
        } elseif (($lead & 0xF8) == 0xF0) {     // 1111 0xxx
            $extra = 3;
            $cp    = $lead & 0x07;
        } else {
            continue;                           // ASCII or not a usable lead byte
        }

        if ($i + $extra >= $max) continue;      // truncated sequence, leave as is

        $valid = true;
        for ($j = 1; $j <= $extra; $j++) {
            $byte = ord($str[$i + $j]);
            if (($byte & 0xC0) != 0x80) {       // not a 10xx xxxx continuation
                $valid = false;
                break;
            }
            $cp = ($cp << 6) | ($byte & 0x3F);
        }
        if (!$valid) continue;

        // flush the untouched bytes we have passed, then the entity
        $ret .= substr($str, $last, $i - $last) . '&#' . $cp . ';';
        $i   += $extra;
        $last = $i + 1;
    }

    return $ret . substr($str, $last); // append the last batch of regular characters
}
438
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. In non-strict mode bytes that cannot start or
 * continue a sequence are skipped and completed but illegal sequences
 * are still added to the output.
 *
 * Changes in this revision: bytes are read with "$str[$i]" (the former
 * "$str{$i}" syntax is not parsed by PHP 8), and strict mode now also
 * reports a multi-octet sequence that is cut off by the end of the input.
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str, $strict = false) {
    $mState = 0;     // expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // Unicode character assembled so far
    $mBytes = 1;     // expected number of octets in the current sequence

    $out = array();
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ($mState == 0) {

            // When mState is zero we expect either a US-ASCII character or
            // the first octet of a multi-octet sequence.
            if (0 == (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & $in)) {
                // First octet of 2 octet sequence
                $mUcs4  = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & $in)) {
                // First octet of 3 octet sequence
                $mUcs4  = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & $in)) {
                // First octet of 4 octet sequence
                $mUcs4  = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & $in)) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4  = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & $in)) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4  = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif ($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;
            }

        } else {

            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence
            if (0x80 == (0xC0 & $in)) {

                // Legal continuation: merge its six payload bits
                $shift  = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                // End of the multi-octet sequence. mUcs4 now contains the
                // final Unicode codepoint to be output
                if (0 == --$mState) {

                    // Check for illegal sequences and codepoints.
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if ($strict) {
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );
                            return FALSE;
                        }
                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    // initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif ($strict) {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;
            }
        }
    }

    // the input ended while continuation octets were still expected
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                '   sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );
        return FALSE;
    }

    return $out;
}
610
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING. In
 * non-strict mode such values are skipped.
 *
 * Changes in this revision: the result is built by plain string
 * concatenation. The earlier output-buffer based version left its buffer
 * open whenever strict mode returned early. Negative values are now
 * treated as out of range instead of being encoded as garbage bytes.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr, $strict = false) {
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        if ($cp < 0 || $cp > 0x10ffff) {
            // out of range
            if ($strict) {
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        } else if ($cp <= 0x007f) {
            // ASCII range (including control chars)
            $result .= chr($cp);

        } else if ($cp <= 0x07ff) {
            // 2 byte sequence
            $result .= chr(0xc0 | ($cp >> 6))
                     . chr(0x80 | ($cp & 0x003f));

        } else if ($cp == 0xFEFF) {
            // Byte order mark: nop -- zap the BOM

        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
            // found a surrogate
            if ($strict) {
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        } else if ($cp <= 0xffff) {
            // 3 byte sequence
            $result .= chr(0xe0 | ($cp >> 12))
                     . chr(0x80 | (($cp >> 6) & 0x003f))
                     . chr(0x80 | ($cp & 0x003f));

        } else {
            // 4 byte sequence
            $result .= chr(0xf0 | ($cp >> 18))
                     . chr(0x80 | (($cp >> 12) & 0x3f))
                     . chr(0x80 | (($cp >> 6) & 0x3f))
                     . chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}
701
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
 * decodes the string with utf8_to_unicode() and packs each code point as
 * a big endian 16 bit unit. Code points between U+10000 and U+10FFFF are
 * now written as a surrogate pair; they used to be truncated to their
 * lower 16 bits, which made the fallback UCS-2 only.
 *
 * @param  string  $str UTF-8 encoded string (taken by reference, not modified here)
 * @param  boolean $bom prepend the big endian byte order mark (FE FF)?
 * @return string  UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
    $out = $bom ? "\xFE\xFF" : '';
    if (UTF8_MBSTRING) return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');

    $uni = utf8_to_unicode($str);
    foreach ($uni as $cp) {
        if ($cp > 0xFFFF && $cp <= 0x10FFFF) {
            // split into high and low surrogate
            $cp  -= 0x10000;
            $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
        } else {
            $out .= pack('n', $cp);
        }
    }
    return $out;
}
717
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is unpacked into big endian 16 bit units. A high surrogate
 * directly followed by a low surrogate is now combined into the code
 * point it stands for; previously both halves were handed on separately
 * and the character was lost. Unpaired surrogates are passed on as they
 * are and left to unicode_to_utf8() to deal with.
 *
 * @param  string $str UTF-16BE encoded string (taken by reference, not modified here)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
    $units = unpack('n*', $str);
    if (!is_array($units)) return '';

    $uni  = array();
    $high = null;   // pending high surrogate waiting for its partner

    foreach ($units as $unit) {
        if ($high !== null) {
            if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                $high  = null;
                continue;
            }
            // no low surrogate followed: hand the lone unit on unchanged
            $uni[] = $high;
            $high  = null;
        }

        if ($unit >= 0xD800 && $unit <= 0xDBFF) {
            $high = $unit;
        } else {
            $uni[] = $unit;
        }
    }
    if ($high !== null) $uni[] = $high;

    return unicode_to_utf8($uni);
}
727
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is now scanned in place with an anchored match and a moving
 * offset. The earlier version cut the matched prefix off the string after
 * every character, which copied the remainder each time (quadratic for
 * long inputs), and collected its output in an output buffer.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $result = '';
    $pos    = 0;
    $len    = strlen($str);

    // \G pins each match to the current offset
    while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
        // group 2 only takes part in the match for an invalid byte
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos    += strlen($matches[0]);
    }

    return $result;
}
767
/**
 * Move a byte index inside a UTF-8 string onto a character boundary
 *
 * An index of zero or less yields 0, an index at or beyond the end yields
 * the byte length. In between, the index is moved for as long as it
 * points at a continuation byte (10xxxxxx).
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = towards the start (current character)
 *                           true  = towards the end (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str, $i, $next = false) {
    if ($i <= 0) {
        return 0;
    }

    $limit = strlen($str);
    if ($i >= $limit) {
        return $limit;
    }

    $step = $next ? 1 : -1;
    while ($i != 0 && $i < $limit && (ord($str[$i]) & 0xC0) == 0x80) {
        $i += $step;
    }

    return $i;
}
796
// The case lookup tables are only built when mbstring is not in use
// (UTF8_MBSTRING is 0); utf8_strtolower() and utf8_strtoupper() read them
// in their fallback code paths.
if(!UTF8_MBSTRING){

  /**
   * UTF-8 Case lookup table
   *
   * Keys are the code points of lower case letters, values are the code
   * points of the corresponding upper case letters. A few keys appear more
   * than once in the list with the same value.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_LOWER_TO_UPPER;
  $UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
  );

  /**
   * UTF-8 Case lookup table
   *
   * Keys are the code points of upper case letters, values are the code
   * points of the corresponding lower case letters. The table is derived
   * by flipping $UTF8_LOWER_TO_UPPER; errors raised by array_flip() are
   * suppressed with @.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_UPPER_TO_LOWER;
  $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
867
868
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps accented lower case characters to replacement strings from the
 * ASCII-7 range. Some replacements are longer than one character
 * (e.g. 'ß' => 'ss', 'ä' => 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);
896
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps accented upper case characters to replacement strings from the
 * ASCII-7 range. Some replacements are longer than one character
 * (e.g. 'Ä' => 'Ae', 'Þ' => 'Th').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
924
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it is not a complete list of
 * non-alphanumeric characters in Unicode. It is not perfect but should match
 * most cases of special chars.
 *
 * Each entry is a Unicode code point. The list is kept in ascending order.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 * (the gaps in the first rows mark where they were left out).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);

/**
 * UTF-8 string version of $UTF8_SPECIAL_CHARS
 *
 * Contains the UTF-8 encoding of every code point of $UTF8_SPECIAL_CHARS,
 * concatenated in table order. The string is built from the table instead of
 * being kept as a literal: a literal has to carry raw control, private-use and
 * multibyte bytes that are easily damaged by editors and conversion tools
 * (the former literal contained U+FFFD replacement characters where multibyte
 * sequences had been cut apart), and it can silently drift away from the table.
 *
 * All code points in the table are below 0x10000, so at most three bytes are
 * needed per character.
 */
global $UTF8_SPECIAL_CHARS2;
$UTF8_SPECIAL_CHARS2 = '';
foreach($UTF8_SPECIAL_CHARS as $utf8_special_cp){
  if($utf8_special_cp < 0x80){
    // 7bit ASCII: one byte
    $UTF8_SPECIAL_CHARS2 .= chr($utf8_special_cp);
  }elseif($utf8_special_cp < 0x800){
    // two byte sequence: 110xxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xC0 | ($utf8_special_cp >> 6)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }else{
    // three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xE0 | ($utf8_special_cp >> 12)).
                            chr(0x80 | (($utf8_special_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }
}
// do not leave the loop variable behind in the including scope
unset($utf8_special_cp);
1012
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Keys are one or more UTF-8 characters, values are the ASCII replacement
 * (which may be empty). Several keys are listed more than once with different
 * replacements (e.g. 'ふ', 'ら', 'じゃ', 'รร'); PHP keeps the value of the last
 * occurrence, the earlier ones have no effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  // Russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // (plain/aspirated pairs: ტ/თ => t/th, პ/ფ => p/ph, კ/ქ => k/kh)
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  // Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  // Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  // Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1178
1179//Setup VIM: ex: et ts=2 enc=utf-8 :
1180
1181