xref: /dokuwiki/inc/utf8.php (revision 680824244fa1719cdf9dae5b8c6e74d748459067)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect mb_string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING
 * has not been defined, otherwise 0. A value defined before this file
 * is loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// let all mb_* functions treat strings as UTF-8 by default
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
21
22
/**
 * URL-encode a filename so it may contain Unicode characters
 *
 * Slashes are not encoded.
 *
 * With $safe enabled (the default) a name made up solely of ASCII
 * letters, digits and the characters / _ - . % is returned untouched.
 * This makes it safe to run the function several times on the same
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding when the name looks already safe/encoded
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  if(!$safe || !preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
    // encode everything, then bring the path separators back
    return str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
44
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
57
/**
 * Checks if a string contains 7bit ASCII only
 *
 * Implemented as a single byte-wise regular expression: the string is
 * ASCII when it contains no byte in the range 0x80-0xFF. (The former
 * implementation used the $str{$i} offset syntax, which no longer
 * parses on PHP 8.)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool   true when no byte has its high bit set
 */
function utf8_isASCII($str){
  return !preg_match('/[\x80-\xFF]/', $str);
}
69
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte in the range 0x80-0xFF is
 * removed. The pattern is applied byte-wise (no 'u' modifier), so
 * malformed UTF-8 input is handled as well. (The former implementation
 * used the $str{$i} offset syntax, which no longer parses on PHP 8.)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  return preg_replace('/[\x80-\xFF]/', '', $str);
}
86
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the bytes and verifies that every lead byte (patterns for 2 to
 * 6 byte sequences are accepted) is followed by the announced number
 * of continuation bytes (10bbbbbb). No further checks are done on the
 * decoded values.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool
 */
function utf8_check($Str) {
 $len = strlen($Str);
 $i   = 0;
 while ($i < $len) {
  $lead = ord($Str[$i]);

  if ($lead < 0x80) {                       # 0bbbbbbb
   $i++;
   continue;
  }

  # number of continuation bytes announced by the lead byte
  if     (($lead & 0xE0) == 0xC0) $tail = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $tail = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $tail = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $tail = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $tail = 5; # 1111110b
  else return false;                         # Does not match any model

  # each of the announced bytes has to exist and match 10bbbbbb
  while ($tail-- > 0) {
   $i++;
   if ($i == $len) return false;
   if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
  }
  $i++;
 }
 return true;
}
110
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright.
 *
 * utf8_decode() is not guaranteed to exist (it was part of the xml
 * extension before PHP 7.2 and is deprecated as of PHP 8.2), so it is
 * only used when present. Otherwise mb_strlen() is used when mbstring
 * support was detected, and as a last resort all bytes that are not
 * continuation bytes are counted.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int    number of characters
 */
function utf8_strlen($string){
  if(function_exists('utf8_decode')){
    return strlen(utf8_decode($string));
  }

  if(defined('UTF8_MBSTRING') && UTF8_MBSTRING){
    return mb_strlen($string,'UTF-8');
  }

  // every UTF-8 character has exactly one byte that is not 10bbbbbb
  return strlen(preg_replace('/[\x80-\xBF]/','',$string));
}
125
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * With mbstring support the call is handed to mb_substr(). Otherwise a
 * PCRE pattern with the 'u' modifier is assembled from two parts:
 *
 *  - a non-captured head that skips $offset characters
 *  - a captured part describing the wanted characters
 *
 * PCRE only supports repetitions of less than 65536, so larger counts
 * are expressed as repeated groups of 65535 characters plus a rest.
 *
 * Like mb_substr (and unlike substr) this never returns false, an
 * empty string is returned instead.
 *
 * Counting the characters of the string is relatively expensive, so it
 * is only done when needed (negative offset or a given length).
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return string
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        return ($length === null)
            ? mb_substr($str, $offset)
            : mb_substr($str, $offset, $length);
    }

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str    = (string)$str;
    $offset = (int)$offset;
    if ($length !== null) $length = (int)$length;

    // trivial cases that never yield any characters
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    // character count of $str, only determined once it is really needed
    $charCount = null;

    // turn a negative offset into the equivalent positive one
    // (a tail anchored pattern would work, but those are horribly slow)
    if ($offset < 0) {
      $charCount = strlen(utf8_decode($str));
      $offset    = max(0, $charCount + $offset);
    }

    // head: a non-captured group as long as the offset
    $head = '^';
    if ($offset > 0) {
      $blocks = (int)($offset/65535);
      $rest   = $offset%65535;

      $head = '^(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';
    }

    // tail: the captured part
    $tail = '';
    if ($length === null) {
      $tail = '(.*)$';                            // the rest of the string
    } else {
      if ($charCount === null) $charCount = strlen(utf8_decode($str));
      if ($offset > $charCount) return '';        // offset is behind the end

      if ($length > 0) {

        // never ask for more characters than are left
        $length = min($charCount-$offset, $length);

        $blocks = (int)($length/65535);
        $rest   = $length%65535;

        // positive length: capture exactly that many characters
        $tail = '('.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';

      } else if ($length < 0) {

        if ($length < ($offset - $charCount)) return '';

        $blocks = (int)((-$length)/65535);
        $rest   = (-$length)%65535;

        // negative length: capture everything but -length characters
        // anchored at the end of the string
        $tail = '(.*)(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})$';
      }
    }

    if (!preg_match('#'.$head.$tail.'#us',$str,$match)) return '';
    return $match[1];
}
227
/**
 * Unicode aware replacement for substr_replace()
 *
 * Offsets are counted in characters. Negative values are handled the
 * way substr_replace() documents them:
 *
 *  - a negative $start counts from the end of the string
 *  - a negative $length gives the number of characters from the end of
 *    the string at which the replacing stops
 *
 * Note that, unlike substr_replace(), $length defaults to 0 here, i.e.
 * the replacement is inserted at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      the input string
 * @param  string $replacement the text to put in
 * @param  int    $start       character offset where the replacing starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $total  = utf8_strlen($string);
  $start  = (int)$start;
  $length = (int)$length;

  // normalise the start into the range 0..$total
  if($start < 0){
    $start = max(0, $total + $start);
  }elseif($start > $total){
    $start = $total;
  }

  // normalise the length into the range 0..($total - $start)
  if($length < 0){
    $length = max(0, $total - $start + $length);
  }
  if($start + $length > $total){
    $length = $total - $start;
  }

  $ret = '';
  if($start > 0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  if($start + $length < $total) $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
241
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a PCRE pattern with the 'u'
 * modifier. An empty separator raises an E_USER_WARNING and yields
 * false.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator
 * @param  string $str string to split
 * @return mixed  array of parts, false on empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
257
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and used as PCRE patterns with the 'u'
 * modifier. The replacement is meant to be taken literally, just like
 * in str_replace(), so backslashes and dollar signs in it are escaped
 * before it is handed to preg_replace() - otherwise sequences like
 * "$0" or "\1" would be treated as back references.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to replace with
 * @param  mixed  $str subject
 * @return mixed  result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // make the replacement(s) literal: escape \ first, then $
  $plain   = array('\\','$');
  $escaped = array('\\\\','\\$');
  if(!is_array($r)){
    $r = str_replace($plain,$escaped,$r);
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = str_replace($plain,$escaped,$v);
    }
  }

  return preg_replace($s,$r,$str);
}
275
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is handed to ltrim(). Otherwise the
 * characters of $charlist are put into a PCRE character class that is
 * matched at the start of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass
  //(^ is included so a leading ^ can not negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
291
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is handed to rtrim(). Otherwise the
 * characters of $charlist are put into a PCRE character class that is
 * matched at the end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass
  //(^ is included so a leading ^ can not negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
307
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is handed to trim(). Otherwise the given
 * characters are stripped from both ends by utf8_rtrim() and
 * utf8_ltrim(). (The charlist has to be passed on to both of them -
 * without it they would only strip whitespace.)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
320
321
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is
 * converted to code points and each one is looked up in the global
 * $UTF8_UPPER_TO_LOWER table; code points without an entry are kept.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // !empty() avoids an undefined index access for unmapped code points
    if(!empty($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
344
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is
 * converted to code points and each one is looked up in the global
 * $UTF8_LOWER_TO_UPPER table; code points without an entry are kept.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // !empty() avoids an undefined index access for unmapped code points
    if(!empty($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
367
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from the global tables
 * $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS. Use the optional
 * parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0), lower case
 * letters being handled first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  // collect the tables to apply, in order
  $tables = array();
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $tables[] = $UTF8_LOWER_ACCENTS;
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $tables[] = $UTF8_UPPER_ACCENTS;
  }

  foreach($tables as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
387
/**
 * Romanize a non-latin string
 *
 * Pure ASCII strings are returned as they are, everything else is
 * translated with strtr() using the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  return $string; //nothing to do
}
399
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS2
 * string; they are quoted once and cached for later calls. In addition
 * the bytes 0x00 to 0x19 and whatever is given in $additional are
 * stripped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;
  global $UTF8_SPECIAL_CHARS2;

  // quoted special chars, built on the first call only
  // (the code point array $UTF8_SPECIAL_CHARS is not used for this anymore)
  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
423
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Otherwise the haystack is
 * split at the needle and the length of the first part is the
 * position. A non-zero offset is handled by cutting the haystack at
 * the offset and searching in the remainder.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset   character offset to start searching at
 * @return mixed   character position or false if not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search in the part behind the offset, then shift the result back
    $remainder = utf8_substr($haystack, $offset);
    $found     = utf8_strpos($remainder,$needle);
    return ($found === false) ? false : $found + $offset;
  }

  // more than one part means the needle occurred at least once
  $parts = utf8_explode($needle, $haystack);
  return (count($parts) > 1) ? utf8_strlen($parts[0]) : false;
}
455
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII characters are kept, code points below 0x100 become decimal
 * entities (&#nnn;) and everything else becomes a hexadecimal entity
 * (&#xhhh;).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
    $ret = '';
    foreach (utf8_to_unicode($str) as $cp) {
        if ($cp >= 0x100) {
            $ret .= '&#x'.dechex($cp).';';
        } elseif ($cp >= 0x80) {
            $ret .= '&#'.$cp.';';
        } else {
            $ret .= chr($cp);
        }
    }
    return $ret;
}
475
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 * are handled as well. Doing both in a single pass avoids the problem
 * that would occur if you had to decode "&amp;#38;&#38;amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 * what it should be                   -> "&#38;&amp#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // one decoder instance is shared by all calls
    static $decoder = null;
    if ($decoder === null) {
        $decoder = new utf8_entity_decoder();
    }

    if ($entities !== null) {
        // numeric and named entities
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
    }

    // numeric entities only
    return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                 'utf8_decode_numeric', $str);
}
/**
 * Callback for utf8_unhtml(): decodes a single numeric entity
 *
 * Expects the match array produced by the patterns in utf8_unhtml():
 * index 0 holds the whole entity, index 2 an optional 'x'/'X' marking
 * hexadecimal notation and index 3 the digits.
 *
 * The patterns accept any alphanumeric payload, so the digits are
 * validated here. Something like "&#foo;" is not a numeric entity and
 * is returned unchanged instead of being turned into a NUL byte.
 *
 * @param  array  $ent match array
 * @return string the UTF-8 character or the untouched entity
 */
function utf8_decode_numeric($ent) {
    $digits = $ent[3];

    if ($ent[2] == 'X' || $ent[2] == 'x') {
        if (!preg_match('/^[0-9A-Fa-f]+$/', $digits)) return $ent[0];
        $cp = hexdec($digits);
    } else {
        if (!preg_match('/^[0-9]+$/', $digits)) return $ent[0];
        $cp = intval($digits);
    }

    return unicode_to_utf8(array($cp));
}
/**
 * Helper for utf8_unhtml(): decodes numeric and named HTML entities
 *
 * Holds a lookup table from entity ("&auml;") to UTF-8 character that
 * is built from PHP's HTML translation table.
 */
class utf8_entity_decoder {
    /** @var array entity => UTF-8 character */
    var $table;

    /**
     * Builds the lookup table
     *
     * PHP 8 no longer treats a method named like the class as the
     * constructor, so the setup has to live in __construct().
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES);
        $table = array_flip($table);

        $this->table = array();
        foreach ($table as $entity => $char) {
            $this->table[$entity] = $this->makeutf8($char);
        }
    }

    /**
     * Old style constructor, kept for backward compatibility
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Converts one character of the translation table to UTF-8
     *
     * A single byte is taken as its code point (ISO-8859-1 style
     * table). Newer PHP versions return the table in UTF-8 by default;
     * such entries span several bytes and are passed through as they
     * are - presumably the table encoding is UTF-8 in that case, using
     * ord() on them would only look at their first byte.
     *
     * @param  string $c character from the translation table
     * @return string UTF-8 character
     */
    function makeutf8($c) {
        if (strlen($c) > 1) return $c;
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * Callback for preg_replace_callback() in utf8_unhtml()
     *
     * @param  array  $ent match array: 0 = whole entity, 1 = '#' for
     *                     numeric entities, 2 = hex marker, 3 = payload
     * @return string decoded character, unknown entities stay untouched
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        }
        if (isset($this->table[$ent[0]])) {
            return $this->table[$ent[0]];
        }
        return $ent[0];
    }
}
537
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence (bad lead byte, missing
 * continuation byte, non-shortest form, surrogate, sequence of more
 * than 4 octets or code point above 0x10FFFF) and raises a PHP error
 * at level E_USER_WARNING. Without $strict bad lead and continuation
 * bytes are skipped silently and illegal code points are passed on.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * Bytes are accessed with the $str[$i] syntax; the former $str{$i}
 * form does not parse on PHP 8 anymore.
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4  = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4  = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4  = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4  = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4  = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: add its 6 payload bits at the right position
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
        }
    }
    return $out;
}
709
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are skipped.
 *
 * The result is collected in a plain string. No output buffer is
 * opened, so nothing is left dangling when the function bails out
 * early in strict mode.
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # negative values are no code points
        if ($cp < 0) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ($cp <= 0x10ffff) {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));

        } elseif($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return false;
        }
    }

    return $utf8;
}
800
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when mbstring support is available.
 * Otherwise the string is decoded with utf8_to_unicode() and packed by
 * hand: code points above 0xFFFF are written as a surrogate pair, and
 * values above 0x10FFFF (which UTF-16 can not represent) are replaced
 * by U+FFFD.
 *
 * @param  string $str UTF-8 encoded string
 * @param  bool   $bom prepend a big endian byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0x10FFFF){
      // not representable in UTF-16
      $out .= pack('n',0xFFFD);
    }elseif($cp > 0xFFFF){
      // split the 20 bits above 0x10000 into high and low surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
816
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is unpacked into 16 bit units. A high surrogate that is
 * directly followed by a low surrogate is combined into the code point
 * above 0xFFFF they stand for; everything else is passed on unchanged
 * to unicode_to_utf8().
 *
 * @param  string $str UTF-16BE encoded string
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $units = array_values($units);   // unpack() numbers its result from 1
  $cnt   = count($units);
  $uni   = array();

  for($i=0; $i<$cnt; $i++){
    $unit = $units[$i];

    if($unit >= 0xD800 && $unit <= 0xDBFF && ($i+1) < $cnt &&
       $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
      // surrogate pair: 10 bits from each half, offset by 0x10000
      $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
      $i++;
    }else{
      $uni[] = $unit;
    }
  }

  return unicode_to_utf8($uni);
}
826
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is tokenized in a single pass: every token is either a
 * well formed UTF-8 character, which is kept, or a single invalid byte
 * (second capture group), which is replaced.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string to search
 * @param string to replace bad bytes with (defaults to '', i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    if (!preg_match_all('/'.$UTF8_BAD.'/S', $str, $tokens, PREG_SET_ORDER)) {
        return '';
    }

    $result = '';
    foreach ($tokens as $token) {
        // group 2 only takes part in the match for an invalid byte
        if (isset($token[2]) && $token[2] !== '') {
            $result .= $replace;
        } else {
            $result .= $token[0];
        }
    }
    return $result;
}
866
/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * An index pointing at a continuation byte (10bbbbbb) is moved until it
 * no longer does. Indexes at or below 0 yield 0, indexes at or beyond
 * the end of the string yield the string length.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = towards the start (current character)
 *                           true = towards the end (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $limit = strlen($str);
  if ($i>=$limit) return $limit;

  // walk off the continuation bytes in the requested direction
  $step = $next ? 1 : -1;
  while (($i > 0) && ($i < $limit) && ((ord($str[$i]) & 0xC0) == 0x80)) {
    $i += $step;
  }

  return $i;
}
895
896// only needed if no mb_string available
897if(!UTF8_MBSTRING){
898
899  /**
900   * UTF-8 Case lookup table
901   *
   * This lookup table maps lower case letters to their corresponding
   * upper case letter (both given as Unicode code points)
904   *
905   * @author Andreas Gohr <andi@splitbrain.org>
906   */
907  global $UTF8_LOWER_TO_UPPER;
908  $UTF8_LOWER_TO_UPPER = array(
909    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
910    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
911    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
912    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
913    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
914    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
915    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
916    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
917    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
918    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
919    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
920    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
921    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
922    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
923    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
924    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
925    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
926    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
927    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
928    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
929    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
930    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
931    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
932    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
933    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
934    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
935    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
936    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
937    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
938    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
939    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
940    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
941    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
942    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
943    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
944    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
945    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
946    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
947    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
948    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
949    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
950    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
951    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
952  );
953
954  /**
955   * UTF-8 Case lookup table
956   *
957   * This lookup table maps upper case letters to their corresponding
958   * lower case letters in UTF-8 (it is built by flipping $UTF8_LOWER_TO_UPPER)
959   *
960   * @author Andreas Gohr <andi@splitbrain.org>
961   */
962  global $UTF8_UPPER_TO_LOWER;
963  $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
964
965} // end of case lookup tables
966
967
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps each accented (or otherwise non-ASCII) lower case letter to a
 * replacement built from ASCII-7 characters only. Most replacements are a
 * single letter; a few expand to two letters (e.g. 'ä' => 'ae', 'ß' => 'ss',
 * 'þ' => 'th'). Only lower case letters are listed here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f',  'ë'=>'e',  'š'=>'s',  'ơ'=>'o',  'ß'=>'ss',
  'ă'=>'a',  'ř'=>'r',  'ț'=>'t',  'ň'=>'n',  'ā'=>'a',  'ķ'=>'k',  'ŝ'=>'s',  'ỳ'=>'y',
  'ņ'=>'n',  'ĺ'=>'l',  'ħ'=>'h',  'ṗ'=>'p',  'ó'=>'o',  'ú'=>'u',  'ě'=>'e',  'é'=>'e',
  'ç'=>'c',  'ẁ'=>'w',  'ċ'=>'c',  'õ'=>'o',  'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t',
  'ș'=>'s',  'ė'=>'e',  'ĉ'=>'c',  'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c',  'ę'=>'e',
  'ŵ'=>'w',  'ṫ'=>'t',  'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e',  'ŷ'=>'y',  'ą'=>'a',
  'ł'=>'l',  'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g',  'ļ'=>'l',  'ƒ'=>'f',  'ž'=>'z',
  'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i',  'ï'=>'i',  'ḋ'=>'d',  'ť'=>'t',  'ŗ'=>'r',
  'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r',  'ê'=>'e',  'ü'=>'ue', 'ò'=>'o',  'ē'=>'e',  'ñ'=>'n',
  'ń'=>'n',  'ĥ'=>'h',  'ĝ'=>'g',  'đ'=>'d',  'ĵ'=>'j',  'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',
  'ư'=>'u',  'ţ'=>'t',  'ý'=>'y',  'ő'=>'o',  'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z',
  'ī'=>'i',  'ã'=>'a',  'ġ'=>'g',  'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u',  'į'=>'i',
  'ź'=>'z',  'á'=>'a',  'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',  'ĕ'=>'e',
);
995
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps each accented (or otherwise non-ASCII) upper case letter to a
 * replacement built from ASCII-7 characters only. Two-letter replacements
 * are capitalized on the first letter only (e.g. 'Ä' => 'Ae', 'Þ' => 'Th').
 * Only upper case letters are listed here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À'=>'A',  'Ô'=>'O',  'Ď'=>'D',  'Ḟ'=>'F',  'Ë'=>'E',  'Š'=>'S',  'Ơ'=>'O',  'Ă'=>'A',
  'Ř'=>'R',  'Ț'=>'T',  'Ň'=>'N',  'Ā'=>'A',  'Ķ'=>'K',  'Ŝ'=>'S',  'Ỳ'=>'Y',  'Ņ'=>'N',
  'Ĺ'=>'L',  'Ħ'=>'H',  'Ṗ'=>'P',  'Ó'=>'O',  'Ú'=>'U',  'Ě'=>'E',  'É'=>'E',  'Ç'=>'C',
  'Ẁ'=>'W',  'Ċ'=>'C',  'Õ'=>'O',  'Ṡ'=>'S',  'Ø'=>'O',  'Ģ'=>'G',  'Ŧ'=>'T',  'Ș'=>'S',
  'Ė'=>'E',  'Ĉ'=>'C',  'Ś'=>'S',  'Î'=>'I',  'Ű'=>'U',  'Ć'=>'C',  'Ę'=>'E',  'Ŵ'=>'W',
  'Ṫ'=>'T',  'Ū'=>'U',  'Č'=>'C',  'Ö'=>'Oe', 'È'=>'E',  'Ŷ'=>'Y',  'Ą'=>'A',  'Ł'=>'L',
  'Ų'=>'U',  'Ů'=>'U',  'Ş'=>'S',  'Ğ'=>'G',  'Ļ'=>'L',  'Ƒ'=>'F',  'Ž'=>'Z',  'Ẃ'=>'W',
  'Ḃ'=>'B',  'Å'=>'A',  'Ì'=>'I',  'Ï'=>'I',  'Ḋ'=>'D',  'Ť'=>'T',  'Ŗ'=>'R',  'Ä'=>'Ae',
  'Í'=>'I',  'Ŕ'=>'R',  'Ê'=>'E',  'Ü'=>'Ue', 'Ò'=>'O',  'Ē'=>'E',  'Ñ'=>'N',  'Ń'=>'N',
  'Ĥ'=>'H',  'Ĝ'=>'G',  'Đ'=>'D',  'Ĵ'=>'J',  'Ÿ'=>'Y',  'Ũ'=>'U',  'Ŭ'=>'U',  'Ư'=>'U',
  'Ţ'=>'T',  'Ý'=>'Y',  'Ő'=>'O',  'Â'=>'A',  'Ľ'=>'L',  'Ẅ'=>'W',  'Ż'=>'Z',  'Ī'=>'I',
  'Ã'=>'A',  'Ġ'=>'G',  'Ṁ'=>'M',  'Ō'=>'O',  'Ĩ'=>'I',  'Ù'=>'U',  'Į'=>'I',  'Ź'=>'Z',
  'Á'=>'A',  'Û'=>'U',  'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);
1023
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * Each entry is a Unicode code point given as an integer. The list is kept
 * in ascending order and every code point is listed exactly once.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 0xff1d,
  0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 0xff5c,
  0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xffe0,
  0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 0xffeb,
  0xffec, 0xffed, 0xffee,
);
1100
// UTF-8 version of the data in $UTF8_SPECIAL_CHARS.
//
// The string is built from the code point table rather than kept as a
// hand-wrapped literal: the literal form had its lines wrapped in the middle
// of multi-byte sequences and contained a quote that terminated the string
// early, so it could not be edited or re-encoded safely. Deriving it keeps
// both variables in sync by construction.
// NOTE(review): consumers are assumed to treat this as an unordered set of
// characters (e.g. inside a regex character class) - confirm before relying
// on a particular character order.
global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;
$UTF8_SPECIAL_CHARS2 = '';
foreach($UTF8_SPECIAL_CHARS as $utf8_special_cp){
  if($utf8_special_cp < 0x80){
    // 7bit ASCII: one byte
    $UTF8_SPECIAL_CHARS2 .= chr($utf8_special_cp);
  }elseif($utf8_special_cp < 0x800){
    // two byte sequence: 110xxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xC0 | ($utf8_special_cp >> 6)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }elseif($utf8_special_cp < 0x10000){
    // three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xE0 | ($utf8_special_cp >> 12)).
                            chr(0x80 | (($utf8_special_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }else{
    // four byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xF0 | ($utf8_special_cp >> 18)).
                            chr(0x80 | (($utf8_special_cp >> 12) & 0x3F)).
                            chr(0x80 | (($utf8_special_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }
}
// do not leave the loop variable behind in the including scope
unset($utf8_special_cp);
1124
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * other than the latin one into plain ASCII. Keys are one or more UTF-8
 * characters, values are their ASCII replacement (which may be empty).
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way from non-latin to ASCII and it works by simple character replacement
 * only. Specialities of each language are not supported.
 *
 * Several keys in the kana and Thai sections appear more than once with
 * different replacements. PHP keeps the value of the last occurrence of a
 * repeated key, so earlier alternatives for the same key have no effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian ('კ' is the letter kan and maps to 'k'; 'პ' is par and maps to 'p')
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish" (delta maps to 'D'/'d')
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean ('ㅛ' is the vowel yo; 'ㅚ' is oy)
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1290
1291//Setup VIM: ex: et ts=2 enc=utf-8 :
1292
1293