xref: /dokuwiki/inc/utf8.php (revision 01e7a8612cf2ac9dbb9963c2a3ad5876d0ba4132)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect multibyte string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING
 * has not been defined, 0 otherwise. A value defined before this file
 * is loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// let the mb_* functions work on UTF-8 unless told otherwise
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
21
22
/**
 * URL-encode a filename to allow unicode characters
 *
 * Slashes are not encoded.
 *
 * When $safe is true (the default) the string is returned untouched if
 * it consists solely of the characters a-z, A-Z, 0-9, '/', '_', '-',
 * '.' and '%'. An already encoded name contains only such characters,
 * which makes it safe to run the function multiple times on the same
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding when only safe characters are present
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // D modifier: '$' may only match at the very end of the subject,
  // otherwise a name with a trailing newline would pass unencoded
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  // urlencode everything, then restore the path separators
  return str_replace('%2F','/',urlencode($file));
}
44
/**
 * URL-decode a filename
 *
 * Thin wrapper: the argument is handed to urldecode() and the result
 * is returned as is.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
57
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The string is scanned for any byte in the range 0x80-0xFF; an empty
 * string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str string to check
 * @return bool   true if no byte has its high bit set
 */
function utf8_isASCII($str){
  // no 'u' modifier on purpose: the subject is inspected byte by byte
  return !preg_match('/[\x80-\xFF]/', $str);
}
69
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, so the result is a pure ASCII7
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str input string
 * @return string input with all bytes 0x80-0xFF removed
 */
function utf8_strip($str){
  // byte-wise match (no 'u' modifier), runs of high bytes go at once
  return preg_replace('/[\x80-\xFF]+/', '', (string)$str);
}
86
/**
 * Tries to detect if a string is in Unicode (UTF-8) encoding
 *
 * Only the byte structure is examined: every lead byte has to be
 * followed by the number of continuation bytes (10bbbbbb) it announces.
 * Lead bytes announcing sequences of up to six bytes are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str string to examine
 * @return bool   true if the byte structure is consistent
 */
function utf8_check($Str) {
 $len = strlen($Str);
 $pos = 0;
 while ($pos < $len) {
  $lead = ord($Str[$pos]);

  // plain ASCII byte: 0bbbbbbb
  if ($lead < 0x80) {
   $pos++;
   continue;
  }

  // number of continuation bytes announced by the lead byte
  if     (($lead & 0xE0) == 0xC0) $trail = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $trail = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $trail = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $trail = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $trail = 5; # 1111110b
  else return false;                          # matches no model

  // each announced byte must exist and look like 10bbbbbb
  for ($k = 1; $k <= $trail; $k++) {
   if ($pos + $k >= $len) return false;
   if ((ord($Str[$pos + $k]) & 0xC0) != 0x80) return false;
  }
  $pos += $trail + 1;
 }
 return true;
}
110
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode(), which turns every character
 * into exactly one byte (characters outside ISO-8859-1 become '?').
 * For the purpose of counting that is all that is needed.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 string
 * @return int    number of characters
 */
function utf8_strlen($string){
  $singleByte = utf8_decode($string);
  return strlen($singleByte);
}
125
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 *
 * With mbstring available the call is passed on to mb_substr(). Without
 * it a PCRE pattern with the 'u' flag is built: a non-capturing group
 * skips $offset characters and a capturing group picks up the wanted
 * part. The character count of the string is only calculated when it
 * is really needed (negative offset or a given length). An empty string
 * is returned when nothing matches.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string  $str    the UTF-8 string
 * @param integer $offset number of UTF-8 characters offset (from left)
 * @param integer $length (optional) length in UTF-8 characters from offset
 * @return string the requested part, '' if there is none
 */
function utf8_substr($str, $offset, $length = null) {
    if (UTF8_MBSTRING) {
        return ($length === null)
            ? mb_substr($str, $offset)
            : mb_substr($str, $offset, $length);
    }

    // cast once, so later comparisons do not raise notices
    $str       = (string)$str;
    $offset    = (int)$offset;
    $hasLength = !is_null($length);
    if ($hasLength) $length = (int)$length;

    // trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $chars = null;   // character count of $str, filled in on demand

    // turn a negative offset into the equivalent offset from the left
    // (a tail anchored pattern would work too, but is very slow)
    if ($offset < 0) {
        $chars  = strlen(utf8_decode($str));
        $offset = $chars + $offset;
        if ($offset < 0) $offset = 0;
    }

    // part of the pattern that skips the first $offset characters
    if ($offset > 0) {
        $head = '^(?:'._utf8_substr_repeat($offset).')';
    } else {
        $head = '^';
    }

    // part of the pattern that captures the result
    $tail = '';
    if (!$hasLength) {
        $tail = '(.*)$';                            // everything that is left
    } else {
        if ($chars === null) $chars = strlen(utf8_decode($str));
        if ($offset > $chars) return '';

        if ($length > 0) {
            // do not reach beyond the end of the string
            $length = min($chars - $offset, $length);
            $tail   = '('._utf8_substr_repeat($length).')';
        } else if ($length < 0) {
            if ($length < ($offset - $chars)) return '';
            // capture all but the last -$length characters
            $tail = '(.*)(?:'._utf8_substr_repeat(-$length).')$';
        }
    }

    if (!preg_match('#'.$head.$tail.'#us', $str, $match)) return '';
    return $match[1];
}

/**
 * Build a PCRE fragment matching exactly $count arbitrary characters
 *
 * PCRE only accepts repetition counts below 65536, so larger counts are
 * expressed as repeated groups of 65535 characters plus a remainder.
 *
 * @param  int    $count number of characters to match (>= 0)
 * @return string pattern fragment
 */
function _utf8_substr_repeat($count) {
    $blocks = (int)($count/65535);
    $rest   = $count%65535;

    $fragment = '';
    if ($blocks) $fragment = '(?:.{65535}){'.$blocks.'}';
    return $fragment.'.{'.$rest.'}';
}
227
/**
 * Unicode aware replacement for substr_replace()
 *
 * $start and $length are counted in characters. As with substr_replace()
 * a negative $start counts from the end of the string and a negative
 * $length gives the number of characters from the end at which replacing
 * stops. Unlike substr_replace(), $length defaults to 0, i.e. by default
 * $replacement is inserted at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      the UTF-8 string to work on
 * @param  string $replacement text to put in
 * @param  int    $start       character offset where replacing starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $string = (string)$string;
  $start  = (int)$start;
  $length = (int)$length;
  $total  = utf8_strlen($string);

  // normalise $start into the range 0..$total
  if($start < 0){
    $start = max(0, $total + $start);
  }elseif($start > $total){
    $start = $total;
  }

  // a negative length means "stop that many characters before the end"
  if($length < 0){
    $length = max(0, $total - $start + $length);
  }
  $end = min($total, $start + $length);   // first character kept again

  $ret = '';
  if($start > 0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  if($end < $total) $ret .= utf8_substr($string, $end);
  return $ret;
}
241
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a PCRE pattern with the 'u' flag
 * to split the string. An empty separator raises an E_USER_WARNING and
 * yields FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator
 * @param  string $str string to split
 * @return mixed  array of parts or FALSE on empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
257
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as PCRE patterns with the 'u'
 * flag. Replacement strings are taken literally: backslashes and dollar
 * signs are escaped so they are not interpreted as backreferences.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   search string or array of search strings
 * @param  mixed  $r   replacement string or array of replacement strings
 * @param  mixed  $str subject string or array of subject strings
 * @return mixed  result of preg_replace() for the quoted arguments
 */
function utf8_str_replace($s,$r,$str){
  // search terms: match literally
  if(is_array($s)){
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }else{
    $s = '!'.preg_quote($s,'!').'!u';
  }

  // replacements: protect '\' and '$' from preg_replace()
  $literal = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr((string)$v, $literal);
    }
  }else{
    $r = strtr((string)$r, $literal);
  }

  return preg_replace($s,$r,$str);
}
275
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed on to ltrim(). Otherwise every
 * character of $charlist is stripped from the beginning of the string;
 * characters with a special meaning inside a character class
 * (\ - ] [ / ^) are escaped first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      string to trim
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass ('^' included, it would
  //negate or break the class when it comes first)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
291
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed on to rtrim(). Otherwise every
 * character of $charlist is stripped from the end of the string;
 * characters with a special meaning inside a character class
 * (\ - ] [ / ^) are escaped first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      string to trim
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass ('^' included, it would
  //negate or break the class when it comes first)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

  //D modifier: anchor at the real end of the string, not before a
  //trailing newline
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
307
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed on to trim(). Otherwise the
 * characters of $charlist are stripped from both ends of the string
 * using utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      string to trim
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed down, otherwise only whitespace
  // would be stripped
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
320
321
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is
 * converted to code points and every code point found in
 * $UTF8_UPPER_TO_LOWER is replaced by its mapping.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 string
 * @return string lower cased string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping - check before reading
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
344
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is
 * converted to code points and every code point found in
 * $UTF8_LOWER_TO_UPPER is replaced by its mapping.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 string
 * @return string upper cased string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping - check before reading
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
367
/**
 * Replace accented UTF-8 characters by their replacements from the
 * $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS tables
 *
 * $case selects the tables to apply: a negative value uses the lower
 * case table only, a positive value the upper case table only and 0
 * (the default) both, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $doLower = ($case <= 0);
  $doUpper = ($case >= 0);

  if($doLower){
    global $UTF8_LOWER_ACCENTS;
    $from   = array_keys($UTF8_LOWER_ACCENTS);
    $to     = array_values($UTF8_LOWER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }

  if($doUpper){
    global $UTF8_UPPER_ACCENTS;
    $from   = array_keys($UTF8_UPPER_ACCENTS);
    $to     = array_values($UTF8_UPPER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }

  return $string;
}
387
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned unchanged; everything else is translated
 * with strtr() using the $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  //nothing to do
  return $string;
}
399
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip are taken from $UTF8_SPECIAL_CHARS2; the
 * byte range 0x00 to 0x19 and the characters given in $additional are
 * stripped as well. The quoted special chars are computed on the first
 * call and reused afterwards.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;    // declared only, not read here
  global $UTF8_SPECIAL_CHARS2;

  static $quoted = null;
  if($quoted === null){
    $quoted = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  $class = $additional.'\x00-\x19'.$quoted;
  return preg_replace('/['.$class.']/u',$repl,$string);
}
423
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Without it the haystack is
 * split at the needle and the character length of the first part is
 * the position; for a non-zero offset the search runs on the rest of
 * the haystack and the offset is added to the result.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack string to search in
 * @param  string  $needle   string to search for
 * @param  integer $offset   character offset to start searching at
 * @return mixed   character position or false if not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search the remainder, then shift the result back
    $rest  = utf8_substr($haystack, $offset);
    $found = utf8_strpos($rest,$needle);
    if ( false === $found ) return false;
    return $found + $offset;
  }

  // no offset: everything in front of the first needle is the position
  $parts = utf8_explode($needle, $haystack);
  if ( count($parts) > 1 ) {
    return utf8_strlen($parts[0]);
  }
  return false;
}
455
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied unchanged. Well formed two, three and four
 * byte sequences are replaced by a decimal numeric entity (&#NNN;) of
 * the code point they encode. Bytes that are not part of such a
 * sequence (stray continuation bytes, invalid lead bytes, truncated
 * sequences) are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 string
 * @return string string with multibyte characters as numeric entities
 */
function utf8_tohtml ($str) {
  $str  = (string)$str;
  $ret  = '';
  $max  = strlen($str);
  $last = 0;  // start of the pending run of bytes that are copied as is
  $i    = 0;

  while ($i < $max) {
    $lead = ord($str[$i]);

    // sequence length and payload bits of the lead byte
    if ($lead < 0x80) {
      $i++;
      continue;
    } elseif (($lead & 0xE0) == 0xC0) {   // 110x xxxx
      $extra = 1;
      $cp    = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {   // 1110 xxxx
      $extra = 2;
      $cp    = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {   // 1111 0xxx
      $extra = 3;
      $cp    = $lead & 0x07;
    } else {
      $i++;                               // not a usable lead byte
      continue;
    }

    // sequence would run past the end: keep the rest untouched
    if ($i + $extra >= $max) break;

    // collect six payload bits from each continuation byte
    $valid = true;
    for ($k = 1; $k <= $extra; $k++) {
      $byte = ord($str[$i + $k]);
      if (($byte & 0xC0) != 0x80) {
        $valid = false;
        break;
      }
      $cp = ($cp << 6) | ($byte & 0x3F);
    }
    if (!$valid) {
      $i++;
      continue;
    }

    // flush the pending bytes, then the entity
    $ret .= substr($str, $last, $i - $last);
    $ret .= '&#' . $cp . ';';
    $i   += $extra + 1;
    $last = $i;
  }

  // append the last batch of unchanged bytes
  if ($last < $max) $ret .= substr($str, $last);
  return $ret;
}
482
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence (illegal lead byte, missing
 * continuation byte, overlong form, surrogate, code point beyond
 * 0x10FFFF or a sequence cut off at the end of the input) and raises a
 * PHP error at level E_USER_WARNING.
 *
 * Without $strict illegal lead bytes and unexpected non-continuation
 * bytes are skipped silently and illegal code points are passed on to
 * the output.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: put its six payload bits in place
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // input ended in the middle of a multi-octet sequence
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                'sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return FALSE;
    }

    return $out;
}
654
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are skipped.
 *
 * Anything but an array as input yields an empty string.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    // plain concatenation: no output buffer that could be left open
    // when bailing out early in strict mode
    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # outside of the Unicode range
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $utf8;
}
745
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() if mb_string is available. Otherwise the
 * string is decoded with utf8_to_unicode() and each code point is
 * packed as big endian 16 bit unit(s); code points above 0xFFFF (up to
 * 0x10FFFF) are written as surrogate pairs.
 *
 * @param  string $str UTF-8 string (taken by reference, not modified)
 * @param  bool   $bom prepend the UTF-16BE byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF && $cp <= 0x10FFFF){
      // astral plane: split into high and low surrogate
      $cp  -= 0x10000;
      $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
761
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is unpacked into big endian 16 bit units, surrogate pairs
 * are combined into the code point they encode and the result is handed
 * to unicode_to_utf8(). Unpaired surrogates are passed on unchanged and
 * left to unicode_to_utf8() to deal with.
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string UTF-8 string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = null;   // pending high surrogate waiting for its partner
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        // complete pair
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = null;
        continue;
      }
      // no low surrogate followed
      $uni[] = $high;
      $high  = null;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
771
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is walked from left to right with an anchored match, one
 * character (or one bad byte) at a time, so the input is never copied.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str     string to search
 * @param string $replace string to replace bad bytes with (defaults to '',
 *                        i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $str    = (string)$str;
    $len    = strlen($str);
    $pos    = 0;
    $result = '';

    // A modifier: only match right at $pos
    while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
        if ( !isset($matches[2])) {
            $result .= $matches[0];           // valid character
        } else {
            $result .= $replace;              // second group hit: bad byte
        }
        $pos += strlen($matches[0]);
    }
    return $result;
}
811
/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * An index of zero or below yields 0, an index at or beyond the end of
 * the string yields the string's byte length. In between the index is
 * moved off any continuation byte (10bbbbbb) it points at.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = towards the start (current character)
 *                           true = towards the end (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $end = strlen($str);
  if ($i >= $end) return $end;

  if ($next) {
    // walk forward to the start of the next character
    for (; $i < $end; $i++) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
  } else {
    // walk back to the start of the current character
    for (; $i; $i--) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
  }

  return $i;
}
840
841// only needed if no mb_string available
842if(!UTF8_MBSTRING){
843
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps lower case letters to their corresponding
   * upper case letters (as Unicode code points)
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
852  global $UTF8_LOWER_TO_UPPER;
853  $UTF8_LOWER_TO_UPPER = array(
854    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
855    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
856    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
857    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
858    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
859    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
860    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
861    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
862    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
863    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
864    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
865    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
866    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
867    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
868    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
869    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
870    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
871    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
872    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
873    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
874    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
875    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
876    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
877    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
878    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
879    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
880    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
881    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
882    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
883    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
884    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
885    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
886    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
887    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
888    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
889    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
890    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
891    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
892    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
893    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
894    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
895    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
896    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
897  );
898
899  /**
900   * UTF-8 Case lookup table
901   *
902   * This lookup table maps the upper case letters to their corresponding
903   * lower case letter in UTF-8 (it is built by flipping $UTF8_LOWER_TO_UPPER)
904   *
905   * @author Andreas Gohr <andi@splitbrain.org>
906   */
907  global $UTF8_UPPER_TO_LOWER;
908  $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
909
910} // end of case lookup tables
911
912
913/**
914 * UTF-8 lookup table for lower case accented letters
915 *
916 * This lookup table maps accented lower case characters (the keys, written as literal UTF-8 strings)
917 * to replacements from the ASCII-7 range. Most keys map to one letter; a few expand to two (ß to ss, ä/ö/ü to ae/oe/ue, þ to th, ð to dh, æ to ae).
918 *
919 * @author Andreas Gohr <andi@splitbrain.org>
920 * @see    utf8_deaccent()
921 */
922global $UTF8_LOWER_ACCENTS;
923$UTF8_LOWER_ACCENTS = array(
924  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
925  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
926  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
927  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
928  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
929  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
930  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
931  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
932  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
933  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
934  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
935  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
936  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
937  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
938  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
939);
940
941/**
942 * UTF-8 lookup table for upper case accented letters
943 *
944 * This lookup table maps accented upper case characters (the keys, written as literal UTF-8 strings)
945 * to replacements from the ASCII-7 range. Two-letter replacements capitalise only the first letter (Ä/Ö/Ü to Ae/Oe/Ue, Þ to Th, Ð to Dh, Æ to Ae).
946 *
947 * @author Andreas Gohr <andi@splitbrain.org>
948 * @see    utf8_deaccent()
949 */
950global $UTF8_UPPER_ACCENTS;
951$UTF8_UPPER_ACCENTS = array(
952  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
953  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
954  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
955  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
956  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
957  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
958  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
959  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
960  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
961  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
962  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
963  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
964  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
965  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
966  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
967);
968
969/**
970 * Array of common special characters, given as Unicode code points
971 *
972 * This array should contain all special characters (neither a letter nor a digit)
973 * defined in the various local charsets. It is not a complete list of the
974 * non-alphanumeric characters in Unicode and it is not perfect, but it should match
975 * most cases of special chars. Every entry is an integer code point, not a UTF-8 string.
976 *
977 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
978 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
979 *
980 * @author Andreas Gohr <andi@splitbrain.org>
981 * @see    utf8_stripspecials()
982 */
983global $UTF8_SPECIAL_CHARS;
984$UTF8_SPECIAL_CHARS = array(
985  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
986  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
987          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
988  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
989  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
990  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
991  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
992  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
993  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
994  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
995  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
996  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
997  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
998  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
999  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1000  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1001  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1002  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1003  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1004  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1005  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1006  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1007  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1008  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1009  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1010  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1011  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1012  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1013  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1014  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1015  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1016  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1017  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1018  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1019  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1020  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1021  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1022  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1023  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1024  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1025  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1026  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1027  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1028  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1029  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1030  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1031  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1032  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1033  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1034  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1035);
1036
1037// UTF-8 encoded string version of the data in $UTF8_SPECIAL_CHARS, built by concatenating the literal pieces that follow. NOTE(review): the pieces appear to break inside multi-byte characters - confirm before re-wrapping or re-encoding them.
1038global $UTF8_SPECIAL_CHARS2;
1039$UTF8_SPECIAL_CHARS2 =
1040    ' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1041    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1042    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1043    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1044    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1045    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1046    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1047    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1048    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1049    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1050    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1051    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1052    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1053    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1054    '➷➸➹➺➻➼➽➾�'.
1055    '�ﹼﹽ';
1056
1057/**
1058 * Romanization lookup table
1059 *
1060 * This lookup table provides a way to transform strings written in a script that is not
1061 * based on latin letters into plain ASCII. Keys are UTF-8 strings of one or more
1062 * characters, values are the ASCII replacements (some of them are empty strings).
1063 * Please note: this is not a scientific transliteration table. It only works one way,
1064 * from non-latin to ASCII, by simple character replacement; specialities of each language
1065 * are not supported. Some keys appear more than once (see the kana sections): PHP keeps
1066 * only the last value given for a duplicate key in an array literal.
1067 * @author Andreas Gohr <andi@splitbrain.org>
1068 * @author Vitaly Blokhin <vitinfo@vitn.com>
1069 * @link   http://www.uconv.com/translit.htm
1070 * @author Bisqwit <bisqwit@iki.fi>
1071 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1072 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1073 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1074 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1075 */
1076global $UTF8_ROMANIZATION;
1077$UTF8_ROMANIZATION = array(
1078  //russian cyrillic
1079  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1080  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1081  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1082  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1083  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1084  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1085  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1086  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1087  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1088  // Ukrainian cyrillic
1089  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1090  // Georgian - NOTE(review): two different letters are mapped to p below; the first (kani) looks like a typo for k - confirm
1091  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1092  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1093  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1094  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1095  'ჰ'=>'xh',
1096  //Sanskrit
1097  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1098  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1099  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1100  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1101  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1102  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1103  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1104  //Hebrew
1105  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1106  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1107  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1108  'ש'=>'sh','ת'=>'t',
1109  //Arabic
1110  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1111  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1112  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1113  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1114
1115  // Japanese hiragana
1116  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
1117  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
1118  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1119  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
1120  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
1121  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
1122  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1123  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
1124  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
1125  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
1126  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
1127  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
1128  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
1129  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
1130  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1131  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
1132  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
1133  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
1134  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
1135  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
1136  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1137  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
1138  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
1139  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
1140  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
1141  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1142  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1143  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
1144  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
1145  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
1146  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
1147  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1148  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
1149  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
1150  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
1151  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
1152  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
1153  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
1154  'じゅ'=>'zyu',
1155  // Japanese katakana
1156  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
1157  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
1158  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
1159  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
1160  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
1161  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
1162  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
1163  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1164  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
1165  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
1166  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
1167  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
1168  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
1169  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1170  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
1171  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
1172  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
1173  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
1174  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
1175  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1176  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
1177  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
1178  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
1179  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
1180  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
1181  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1182  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
1183  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
1184  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
1185  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
1186  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1187  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
1188  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
1189  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
1190  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
1191  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
1192  'ジョ'=>'zyo','ジュ'=>'zyu',
1193
1194  // "Greeklish" - NOTE(review): delta is mapped to E/e below, which looks like a typo for D/d - confirm
1195  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1196  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1197
1198  // Thai
1199  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1200  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1201  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1202  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1203  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1204  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1205  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
1206  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
1207  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
1208  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
1209  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
1210  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
1211  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
1212  'เ–ียว'=>'iao',
1213
1214  // Korean - NOTE(review): two different jamo are mapped to oy below; the second one (yo) looks like a typo - confirm
1215  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1216  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1217  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1218  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1219  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1220  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1221);
1222
1223//Setup VIM: ex: et ts=2 enc=utf-8 :
1224
1225