xref: /dokuwiki/inc/utf8.php (revision d07dd8ee598c9b9dc8dfc9a61e0fdfa023ad59de)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Decide whether the mbstring extension backs the UTF-8 helpers
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise 0. A value defined earlier is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// make UTF-8 the internal encoding of the mb_* functions
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
21
22
/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string is returned untouched if it
 * consists only of ASCII letters, digits and the characters / _ - . %
 * This makes it safe to run it multiple times on the same string
 * (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip strings that only contain safe characters
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // \z instead of $: a name ending in a newline must not pass as safe
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+\z#',$file)){
    return $file;
  }
  $file = urlencode($file);
  $file = str_replace('%2F','/',$file);
  return $file;
}
44
/**
 * URL-Decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file URL-encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
57
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool true when no byte above 127 is present (also for '')
 */
function utf8_isASCII($str){
  // a single byte-wise scan; no /u so invalid UTF-8 cannot make it fail
  return !preg_match('/[\x80-\xFF]/',$str);
}
69
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 127 is removed, all other
 * bytes are kept in their original order
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  // byte-wise on purpose (no /u): works on malformed input as well
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
86
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the byte structure is inspected: each lead byte must be followed by
 * the number of 10bbbbbb bytes it announces (1 to 5). Overlong forms and
 * code point ranges are not checked.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool false on the first byte that breaks the structure
 */
function utf8_check($Str) {
  $pos = 0;
  while ($pos < strlen($Str)) {
    $lead = ord($Str[$pos]);

    if ($lead < 0x80) {                      // 0bbbbbbb
      $pos++;
      continue;
    }

    if     (($lead & 0xE0) == 0xC0) $trail = 1;   // 110bbbbb
    elseif (($lead & 0xF0) == 0xE0) $trail = 2;   // 1110bbbb
    elseif (($lead & 0xF8) == 0xF0) $trail = 3;   // 11110bbb
    elseif (($lead & 0xFC) == 0xF8) $trail = 4;   // 111110bb
    elseif (($lead & 0xFE) == 0xFC) $trail = 5;   // 1111110b
    else return false;                            // matches no model

    // the announced number of 10bbbbbb bytes has to follow
    while ($trail-- > 0) {
      $pos++;
      if ($pos == strlen($Str)) return false;
      if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
    }
    $pos++;
  }
  return true;
}
110
/**
 * Unicode aware replacement for strlen()
 *
 * Counts characters by counting the bytes that are not continuation bytes
 * (10xxxxxx): every UTF-8 character has exactly one such byte. This avoids
 * utf8_decode(), which is deprecated in recent PHP versions, and gives the
 * same count for valid UTF-8. In malformed input each lead or otherwise
 * invalid high byte counts as one character; stray continuation bytes are
 * not counted.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string UTF-8 string
 * @return int    number of characters
 */
function utf8_strlen($string){
  // byte-wise match (no /u), so it cannot fail on invalid sequences
  return strlen(preg_replace('/[\x80-\xBF]+/','',$string));
}
125
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * With mbstring the call is handed to mb_substr(). Without it, small
 * non-negative offsets and lengths are served by a regular expression;
 * everything else (negative or very large values) is converted to byte
 * offsets first, which is slower.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string  $str
 * @param integer $offset number of UTF-8 characters offset (from left)
 * @param integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if (UTF8_MBSTRING) {
        return ($length === null)
            ? mb_substr($str, $offset)
            : mb_substr($str, $offset, $length);
    }

    $regexable = ($offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534);

    if (!$regexable) {
        // convert character offsets to byte offsets and use normal substr()
        // 1. normalise parameters into positive offset and length
        $chars = strlen(utf8_decode($str));

        if ($offset < 0) {
            $offset = max($chars+$offset,0);
        }
        if ($offset >= $chars) return false;

        // 2a. open ended: only the start byte is needed
        if ($length === null) {
            list($from) = _utf8_byteindex($str,$offset);
            return substr($str,$from);
        }

        // a negative length counts back from the end of the string
        if ($length < 0) {
            $length = $chars-$offset+$length;
            if ($length < 0) return '';
        }

        if ($length === 0) return '';
        if ($chars - $offset < $length) $length = $chars-$offset;

        // 2b. both ends: convert to start and end byte offsets
        list($from,$to) = _utf8_byteindex($str,$offset,$offset+$length);
        return substr($str,$from,$to-$from);
    }

    // regular expression variant: skip $offset characters, capture the rest
    if ($length === null) {
        $length = '*';
    } else {
        $chars = strlen(utf8_decode($str));
        if ($offset > $chars) {
            return '';
        }
        // a length reaching past the end simply means "everything left"
        $length = (($offset + $length) > $chars) ? '*' : '{'.$length.'}';
    }

    preg_match('/^.{'.$offset.'}(.'.$length.')/us', $str, $found);

    return isset($found[1]) ? $found[1] : false;
}
203
204
/**
 * Unicode aware replacement for substr_replace()
 *
 * Joins the first $start characters, the replacement and everything from
 * character $start+$length onwards. The head is only taken for $start > 0.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  character offset where the replacement begins
 * @param  int    $length number of characters to replace (default 0)
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start>0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
218
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a literal pattern for preg_split()
 * in UTF-8 mode. An empty separator raises an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator
 * @param  string $str string to split
 * @return mixed  result of preg_split() or FALSE for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
234
/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement are both treated as literal text: search strings
 * are quoted for use as a pattern, and backslashes and dollar signs in the
 * replacement are escaped so preg_replace() does not read them as
 * backreferences.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of replacement strings
 * @param  string $str subject
 * @return mixed  result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
  // search strings become literal patterns
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // keep "$1", "\1" and friends in the replacement as plain text
  if(!is_array($r)){
    $r = addcslashes($r,'\\$');
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }

  return preg_replace($s,$r,$str);
}
252
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is handed to ltrim(). Otherwise every
 * character of the charlist is taken literally (no ranges) and stripped
 * from the start of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass; ^ is included so a leading
  //caret cannot negate the class
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
268
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is handed to rtrim(). Otherwise every
 * character of the charlist is taken literally (no ranges) and stripped
 * from the very end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass; ^ is included so a leading
  //caret cannot negate the class
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  //D: $ must only match at the very end, not before a final newline
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
284
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is handed to trim(). Otherwise the charlist
 * is stripped from the end and then from the start of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to reach both helpers, otherwise they fall back to
  // plain whitespace trimming
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
297
298
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is converted
 * to code points and each one is looked up in $UTF8_UPPER_TO_LOWER; code
 * points without an entry are kept as they are.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    $cp = $uni[$i];
    // isset: most code points have no mapping and must not raise a notice
    if(isset($UTF8_UPPER_TO_LOWER[$cp])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$cp];
    }
  }
  return unicode_to_utf8($uni);
}
321
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is converted
 * to code points and each one is looked up in $UTF8_LOWER_TO_UPPER; code
 * points without an entry are kept as they are.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    $cp = $uni[$i];
    // isset: most code points have no mapping and must not raise a notice
    if(isset($UTF8_LOWER_TO_UPPER[$cp])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$cp];
    }
  }
  return unicode_to_utf8($uni);
}
344
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * $case selects the lookup tables to apply: -1 only $UTF8_LOWER_ACCENTS,
 * 1 only $UTF8_UPPER_ACCENTS, 0 (default) both, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1, 0 or 1 (see above)
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $doLower = ($case <= 0);
  $doUpper = ($case >= 0);

  if($doLower){
    global $UTF8_LOWER_ACCENTS;
    $from   = array_keys($UTF8_LOWER_ACCENTS);
    $to     = array_values($UTF8_LOWER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }

  if($doUpper){
    global $UTF8_UPPER_ACCENTS;
    $from   = array_keys($UTF8_UPPER_ACCENTS);
    $to     = array_values($UTF8_UPPER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }

  return $string;
}
364
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned unchanged; anything else is translated
 * with strtr() using the $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  return $string; //nothing to do
}
376
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from $UTF8_SPECIAL_CHARS2 (quoted once and
 * cached for later calls), the byte range 0x00 to 0x19 and whatever is
 * passed in $additional. $additional goes into the character class as is,
 * so it has to be valid inside a regexp character class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return mixed  result of preg_replace()
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;
  global $UTF8_SPECIAL_CHARS2;

  // quoting the table is only done on the first call
  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  $class = $additional.'\x00-\x19'.$specials;
  return preg_replace('/['.$class.']/u',$repl,$string);
}
400
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Otherwise the haystack is split
 * at the needle and the character length of the first piece is the
 * position. A non-zero offset is handled by cutting the haystack first
 * and adding the offset to the position found in the remainder.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset character offset to start searching from
 * @return mixed  character position or FALSE if the needle was not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search the remainder, then shift the hit back by the offset
    $rest = utf8_substr($haystack, $offset);
    $pos  = utf8_strpos($rest,$needle);
    return ( false !== $pos ) ? $pos + $offset : false;
  }

  // more than one piece means the needle occurred at least once
  $pieces = utf8_explode($needle, $haystack);
  return ( count($pieces) > 1 ) ? utf8_strlen($pieces[0]) : false;
}
432
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Every well-formed 2, 3 or 4 byte sequence is replaced by a decimal
 * numeric entity (&#NNN;). ASCII bytes are copied unchanged, and so are
 * bytes that do not form a complete sequence (stray continuation bytes,
 * illegal lead bytes, sequences cut off by the end of the string).
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 string
 * @return string
 */
function utf8_tohtml ($str) {
  $ret  = '';
  $max  = strlen($str);
  $last = 0;  // start of the run of bytes not yet copied to $ret
  for ($i=0; $i<$max; $i++) {
    $lead = ord($str[$i]);
    if ($lead < 0x80) continue; // regular character, copied with the run

    // number of continuation bytes and payload bits of the lead byte
    if     (($lead & 0xE0) == 0xC0) { $extra = 1; $cp = $lead & 0x1F; }
    elseif (($lead & 0xF0) == 0xE0) { $extra = 2; $cp = $lead & 0x0F; }
    elseif (($lead & 0xF8) == 0xF0) { $extra = 3; $cp = $lead & 0x07; }
    else continue; // not a lead byte: leave it in the run

    // collect six payload bits from each continuation byte
    $ok = ($i + $extra < $max);
    for ($j=1; $ok && $j<=$extra; $j++) {
      $cont = ord($str[$i+$j]);
      if (($cont & 0xC0) != 0x80) {
        $ok = false;
      } else {
        $cp = ($cp << 6) | ($cont & 0x3F);
      }
    }
    if (!$ok) continue; // incomplete sequence: leave it in the run

    $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
    $ret .= '&#' . $cp . ';';
    $i   += $extra;
    $last = $i+1;
  }
  if ($last < $max) $ret .= substr($str, $last); // append the last batch of regular characters
  return $ret;
}
459
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence (illegal lead byte, missing
 * continuation byte, non-shortest form, surrogate, code point above
 * 0x10FFFF, input ending inside a sequence) and raises a PHP error at
 * level E_USER_WARNING.
 *
 * Without $strict nothing is rejected: illegal lead bytes are skipped, a
 * non-continuation byte inside a sequence is dropped while the sequence
 * stays open, and illegal code points are still added to the output.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // number of continuation octets still expected
    $mUcs4  = 0;     // code point assembled so far
    $mBytes = 1;     // total number of octets of the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or
            // the first octet of a multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: its six payload bits go to the slot
                // that belongs to this position of the sequence
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // the input ended while continuation octets were still expected
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                'sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return FALSE;
    }

    return $out;
}
631
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are skipped.
 *
 * The string is built by plain concatenation, so no output buffer is
 * opened and none can be left behind when strict mode bails out early.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 *               ('' if $arr is not an array)
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        # outside the Unicode range
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $result .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}
722
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() if mbstring is available. The fallback
 * converts via utf8_to_unicode(): code points up to 0xFFFF become one
 * 16 bit unit, higher ones a surrogate pair. Values above 0x10FFFF cannot
 * be expressed in UTF-16 and are written as U+FFFD.
 *
 * @param  string $str UTF-8 string (taken by reference, not modified)
 * @param  bool   $bom prepend the big endian byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0x10FFFF){
      // not representable: replacement character
      $out .= pack('n',0xFFFD);
    }elseif($cp > 0xFFFF){
      // astral plane: high surrogate carries the upper 10 bits, low the rest
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
738
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is read as big endian 16 bit units. A high surrogate directly
 * followed by a low surrogate is combined into one code point; the
 * resulting list is handed to unicode_to_utf8().
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $units = array_values($units); // unpack() numbers its result from 1
  $cnt   = count($units);
  $uni   = array();
  for($i=0; $i<$cnt; $i++){
    $cu = $units[$i];
    if($cu >= 0xD800 && $cu <= 0xDBFF && $i+1 < $cnt &&
       $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
      // surrogate pair: 10 bits from each half, offset by 0x10000
      $uni[] = 0x10000 + (($cu - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
      $i++;
    }else{
      $uni[] = $cu;
    }
  }
  return unicode_to_utf8($uni);
}
748
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * The string is walked from left to right: at each position either one
 * well-formed UTF-8 character is copied or a single invalid byte is
 * swapped for $replace. Matching continues at an offset instead of cutting
 * the string, so the input is not copied again for every character.
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str     string to search
 * @param string $replace string to replace bad bytes with (defaults to '',
 *                        i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    // \G pins every match to the current offset
    $pattern = '/\G'.$UTF8_BAD.'/S';
    $result  = '';
    $pos     = 0;
    $len     = strlen($str);

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // group 2 only takes part in the match for an invalid byte
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos    += strlen($matches[0]);
    }

    return $result;
}
788
/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * Indexes at or below 0 give 0, indexes at or beyond the string length
 * give the length. In between, the index is moved off continuation bytes
 * (10xxxxxx) until it rests on another kind of byte or on a string end.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = towards the string start (current character)
 *                           true = towards the string end (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $limit = strlen($str);
  if ($i>=$limit) return $limit;

  // one loop for both directions; only the sign of the step differs
  $step = $next ? 1 : -1;
  while (($i > 0) && ($i < $limit) && ((ord($str[$i]) & 0xC0) == 0x80)) {
    $i += $step;
  }

  return $i;
}
817
/**
 * determine the byte indexes into a utf-8 string for one or more character offsets
 * PRIVATE  (could be made public with proper parameter checking)
 *
 * The first bytes of the string give a bytes-per-character estimate. For
 * each offset the estimate is applied and refined (at most 50 rounds)
 * until the position is within 7 characters of the target; the remaining
 * distance is covered one character at a time.
 *
 * The offsets are processed in ascending order, so the result is ordered
 * by ascending offset, too.
 *
 * @author  Chris Smith <chris@jalakai.co.uk>
 *
 * @param   string    $str      utf8 string
 * @param   int       $offset   any number of character offsets into $str
 *
 * @return  array     byte indexes into $str, one index for each offset argument
 *                    (FALSE if the first argument is not a string)
 */
function _utf8_byteindex() {

  $args = func_get_args();
  $str = array_shift($args);                       // plain assignment: a function result cannot be bound by reference
  if (!is_string($str)) return false;

  $result = array();

  // use a short piece of str to estimate bytes per character
  $i = utf8_correctIdx($str, 300, true);           // $i (& $j) -> byte indexes into $str
  $c = utf8_strlen(substr($str,0,$i));             // $c -> character offset into $str

  sort($args);                                     // deal with arguments from lowest to highest
  foreach ($args as $offset) {
    // sanity checks FIXME

    // 0 is an easy check
    if ($offset == 0) { $result[] = 0; continue; }

    $safety_valve = 50;                            // ensure no endless looping

    do {
      // apply latest bytes/character estimate to offset; without any
      // counted character there is no estimate, so assume one byte each
      $j = ($c > 0) ? (int)($offset * $i/$c) : (int)$offset;
      $j = utf8_correctIdx($str, $j, true);        // correct to utf8 character boundary

      if ($j > $i) {
        $c += utf8_strlen(substr($str,$i,$j-$i));  // determine new character offset
      } else {
        $c -= utf8_strlen(substr($str,$j,$i-$j));  // ditto
      }

      $error = abs($c-$offset);

      $i = $j;                                     // ready for next time around
    } while (($error > 7) && --$safety_valve) ;    // from 7 it is faster to iterate over the string

    if ($error && $error <= 7) {
      if ($c < $offset) {
        // move towards the string end, one character per step
        while ($error--) { $i = utf8_correctIdx($str,++$i,true); }
      } else {
        // move towards the string start, one character per step
        while ($error--) { $i = utf8_correctIdx($str,--$i,false); }
      }
      $c = $offset;                                // ready for next arg
    }
    $result[] = $i;
  }

  return $result;
}
880
881// only needed if no mb_string available
882if(!UTF8_MBSTRING){
883
884  /**
885   * UTF-8 Case lookup table
886   *
887   * This lookuptable defines the upper case letters to their correspponding
888   * lower case letter in UTF-8
889   *
890   * @author Andreas Gohr <andi@splitbrain.org>
891   */
892  global $UTF8_LOWER_TO_UPPER;
893  $UTF8_LOWER_TO_UPPER = array(
894    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
895    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
896    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
897    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
898    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
899    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
900    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
901    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
902    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
903    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
904    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
905    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
906    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
907    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
908    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
909    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
910    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
911    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
912    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
913    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
914    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
915    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
916    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
917    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
918    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
919    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
920    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
921    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
922    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
923    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
924    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
925    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
926    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
927    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
928    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
929    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
930    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
931    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
932    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
933    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
934    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
935    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
936    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
937  );
938
939  /**
940   * UTF-8 Case lookup table
941   *
   * This lookup table maps upper case letters to their corresponding
   * lower case letters in UTF-8 (it is built by flipping $UTF8_LOWER_TO_UPPER)
944   *
945   * @author Andreas Gohr <andi@splitbrain.org>
946   */
947  global $UTF8_UPPER_TO_LOWER;
948  $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
949
950} // end of case lookup tables
951
952
/**
 * Replacement map for lower case accented letters
 *
 * Each key is a single non-ASCII lower case character, each value is the
 * ASCII-7 text it is replaced with. A replacement may be longer than one
 * character (for example 'ß' becomes 'ss' and 'ö' becomes 'oe').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a','ô'=>'o','ď'=>'d','ḟ'=>'f','ë'=>'e','š'=>'s','ơ'=>'o',
  'ß'=>'ss','ă'=>'a','ř'=>'r','ț'=>'t','ň'=>'n','ā'=>'a','ķ'=>'k',
  'ŝ'=>'s','ỳ'=>'y','ņ'=>'n','ĺ'=>'l','ħ'=>'h','ṗ'=>'p','ó'=>'o',
  'ú'=>'u','ě'=>'e','é'=>'e','ç'=>'c','ẁ'=>'w','ċ'=>'c','õ'=>'o',
  'ṡ'=>'s','ø'=>'o','ģ'=>'g','ŧ'=>'t','ș'=>'s','ė'=>'e','ĉ'=>'c',
  'ś'=>'s','î'=>'i','ű'=>'u','ć'=>'c','ę'=>'e','ŵ'=>'w','ṫ'=>'t',
  'ū'=>'u','č'=>'c','ö'=>'oe','è'=>'e','ŷ'=>'y','ą'=>'a','ł'=>'l',
  'ų'=>'u','ů'=>'u','ş'=>'s','ğ'=>'g','ļ'=>'l','ƒ'=>'f','ž'=>'z',
  'ẃ'=>'w','ḃ'=>'b','å'=>'a','ì'=>'i','ï'=>'i','ḋ'=>'d','ť'=>'t',
  'ŗ'=>'r','ä'=>'ae','í'=>'i','ŕ'=>'r','ê'=>'e','ü'=>'ue','ò'=>'o',
  'ē'=>'e','ñ'=>'n','ń'=>'n','ĥ'=>'h','ĝ'=>'g','đ'=>'d','ĵ'=>'j',
  'ÿ'=>'y','ũ'=>'u','ŭ'=>'u','ư'=>'u','ţ'=>'t','ý'=>'y','ő'=>'o',
  'â'=>'a','ľ'=>'l','ẅ'=>'w','ż'=>'z','ī'=>'i','ã'=>'a','ġ'=>'g',
  'ṁ'=>'m','ō'=>'o','ĩ'=>'i','ù'=>'u','į'=>'i','ź'=>'z','á'=>'a',
  'û'=>'u','þ'=>'th','ð'=>'dh','æ'=>'ae','µ'=>'u','ĕ'=>'e',
);
980
/**
 * Replacement map for upper case accented letters
 *
 * Each key is a single non-ASCII upper case character, each value is the
 * ASCII-7 text it is replaced with. Multi-character replacements are written
 * with only the first letter capitalized (for example 'Ö' becomes 'Oe').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À'=>'A','Ô'=>'O','Ď'=>'D','Ḟ'=>'F','Ë'=>'E','Š'=>'S','Ơ'=>'O',
  'Ă'=>'A','Ř'=>'R','Ț'=>'T','Ň'=>'N','Ā'=>'A','Ķ'=>'K',
  'Ŝ'=>'S','Ỳ'=>'Y','Ņ'=>'N','Ĺ'=>'L','Ħ'=>'H','Ṗ'=>'P','Ó'=>'O',
  'Ú'=>'U','Ě'=>'E','É'=>'E','Ç'=>'C','Ẁ'=>'W','Ċ'=>'C','Õ'=>'O',
  'Ṡ'=>'S','Ø'=>'O','Ģ'=>'G','Ŧ'=>'T','Ș'=>'S','Ė'=>'E','Ĉ'=>'C',
  'Ś'=>'S','Î'=>'I','Ű'=>'U','Ć'=>'C','Ę'=>'E','Ŵ'=>'W','Ṫ'=>'T',
  'Ū'=>'U','Č'=>'C','Ö'=>'Oe','È'=>'E','Ŷ'=>'Y','Ą'=>'A','Ł'=>'L',
  'Ų'=>'U','Ů'=>'U','Ş'=>'S','Ğ'=>'G','Ļ'=>'L','Ƒ'=>'F','Ž'=>'Z',
  'Ẃ'=>'W','Ḃ'=>'B','Å'=>'A','Ì'=>'I','Ï'=>'I','Ḋ'=>'D','Ť'=>'T',
  'Ŗ'=>'R','Ä'=>'Ae','Í'=>'I','Ŕ'=>'R','Ê'=>'E','Ü'=>'Ue','Ò'=>'O',
  'Ē'=>'E','Ñ'=>'N','Ń'=>'N','Ĥ'=>'H','Ĝ'=>'G','Đ'=>'D','Ĵ'=>'J',
  'Ÿ'=>'Y','Ũ'=>'U','Ŭ'=>'U','Ư'=>'U','Ţ'=>'T','Ý'=>'Y','Ő'=>'O',
  'Â'=>'A','Ľ'=>'L','Ẅ'=>'W','Ż'=>'Z','Ī'=>'I','Ã'=>'A','Ġ'=>'G',
  'Ṁ'=>'M','Ō'=>'O','Ĩ'=>'I','Ù'=>'U','Į'=>'I','Ź'=>'Z','Á'=>'A',
  'Û'=>'U','Þ'=>'Th','Ð'=>'Dh','Æ'=>'Ae','Ĕ'=>'E',
);
1008
/**
 * UTF-8 array of common special characters
 *
 * Unicode code points of characters that are neither a letter nor a digit in
 * the various local charsets. This is not a complete list of the
 * non-alphanumeric characters in Unicode; it is meant to match the most
 * common special characters only.
 *
 * The control characters 0x00 to 0x19 are _not_ part of this list, the space
 * (0x20) is. These characters are deliberately left out as well:
 * _ (0x5f), : (0x3a), . (0x2e), - (0x2d), * (0x2a)
 *
 * The values are listed in ascending order and grouped by Unicode block.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  // Basic Latin: control characters 0x1a-0x1f, space, punctuation, DEL
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,
  0x002b, 0x002c, 0x002f,
  0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040,
  0x005b, 0x005c, 0x005d, 0x005e, 0x0060,
  0x007b, 0x007c, 0x007d, 0x007e, 0x007f,

  // Latin-1 Supplement: 0x80-0xbf plus multiplication and division sign
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
  0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
  0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
  0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
  0x00d7, 0x00f7,

  // Spacing Modifier Letters
  0x02c7, 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd,

  // Combining Diacritical Marks
  0x0300, 0x0301, 0x0303, 0x0309, 0x0323,

  // Greek
  0x0384, 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6,

  // Hebrew
  0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9,
  0x05bb, 0x05bc, 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3,
  0x05f3, 0x05f4,

  // Arabic
  0x060c, 0x061b, 0x061f, 0x0640,
  0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 0x0652,
  0x066a,

  // Thai
  0x0e3f,

  // General Punctuation
  0x200c, 0x200d, 0x200e, 0x200f,
  0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a,
  0x201c, 0x201d, 0x201e,
  0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044,

  // Currency Symbols
  0x20a7, 0x20aa, 0x20ab, 0x20ac,

  // Letterlike Symbols
  0x2116, 0x2118, 0x2122, 0x2126, 0x2135,

  // Arrows
  0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195,
  0x21b5,
  0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4,

  // Mathematical Operators
  0x2200, 0x2202, 0x2203,
  0x2205, 0x2206, 0x2207, 0x2208, 0x2209,
  0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220,
  0x2227, 0x2228, 0x2229, 0x222a, 0x222b,
  0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5,

  // Miscellaneous Technical
  0x2310, 0x2320, 0x2321, 0x2329, 0x232a,

  // Enclosed Alphanumerics
  0x2469,

  // Box Drawing
  0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c,
  0x2550, 0x2551, 0x2552, 0x2553, 0x2554, 0x2555, 0x2556, 0x2557,
  0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, 0x255f,
  0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c,

  // Block Elements
  0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 0x2591, 0x2592, 0x2593,

  // Geometric Shapes
  0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,

  // Miscellaneous Symbols
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666,

  // Dingbats
  0x2701, 0x2702, 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709,
  0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717,
  0x2718, 0x2719, 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f,
  0x2720, 0x2721, 0x2722, 0x2723, 0x2724, 0x2725, 0x2726, 0x2727,
  0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 0x272f,
  0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737,
  0x2738, 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f,
  0x2740, 0x2741, 0x2742, 0x2743, 0x2744, 0x2745, 0x2746, 0x2747,
  0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 0x274f,
  0x2750, 0x2751, 0x2752, 0x2756,
  0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 0x275d, 0x275e,
  0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767,
  0x277f, 0x2789, 0x2793, 0x2794,
  0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 0x279f,
  0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7,
  0x27a8, 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af,
  0x27b1, 0x27b2, 0x27b3, 0x27b4, 0x27b5, 0x27b6, 0x27b7,
  0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 0x27be,

  // Private Use Area
  0xf6d9, 0xf6da, 0xf6db,
  0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 0xf8dd, 0xf8de, 0xf8df,
  0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 0xf8e7,
  0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef,
  0xf8f0, 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7,
  0xf8f8, 0xf8f9, 0xf8fa, 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe,

  // Arabic Presentation Forms-B
  0xfe7c, 0xfe7d,
);
1076
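// UTF-8 encoded string variant of the special characters listed above.
// NOTE(review): this literal is not an exact encoding of $UTF8_SPECIAL_CHARS
// (it contains e.g. '€' where the table lists 0x0080), and its fragments look
// like they are wrapped at byte rather than character boundaries, so single
// lines may not be valid UTF-8 on their own - edit it only with a byte-safe
// editor. TODO confirm which of the two tables is authoritative.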
1078global $UTF8_SPECIAL_CHARS2;
1079$UTF8_SPECIAL_CHARS2 =
1080    ' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1081    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1082    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1083    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1084    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1085    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1086    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1087    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1088    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1089    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1090    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1091    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1092    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1093    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1094    '➷➸➹➺➻➼➽➾�'.
1095    '�ﹼﹽ';
1096
/**
 * Romanization lookup table
 *
 * Maps characters (and a number of multi-character sequences) of scripts that
 * are not based on latin letters to a plain ASCII replacement.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and by simple character replacement
 * only. Specialities of each language are not supported.
 *
 * Several keys, mainly in the kana and Thai sections, are listed more than
 * once with different values. PHP keeps only the last value given for a
 * repeated key in an array literal, so for example 'ふ' resolves to 'hu'
 * (not 'fu') and 'い' resolves to 'yi' (not 'i').
 * NOTE(review): confirm that these effective values are the intended ones
 * before removing the shadowed entries.
 *
 * NOTE(review): the values for 'კ', 'Δ', 'δ' and 'ㅛ' were corrected (see the
 * comments in the respective sections). Callers that persist romanized
 * output will see different results for these characters - confirm that
 * this is acceptable for existing data.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // 'კ' (kan) is romanized as 'k'; 'p' is reserved for 'პ' (par)
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  // delta is romanized as 'D'/'d'
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  // 'ㅛ' is romanized as 'yo'; 'oy' belongs to 'ㅚ' only
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1262
1263//Setup VIM: ex: et ts=2 enc=utf-8 :
1264
1265