xref: /dokuwiki/inc/utf8.php (revision 350a8730ed011f71d41833e11599c6b3e7d4bcfb)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9
/**
 * Decide whether the mbstring extension is used
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise 0. A value defined beforehand is left alone.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING',(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}
20
21
/**
 * URL-encode a filename so it may contain Unicode characters
 *
 * Slashes are kept as they are.
 *
 * With $safe enabled (the default) a name made up only of letters, digits
 * and the characters / _ - . % is returned untouched. That makes it safe
 * to run the function several times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename to encode
 * @param  boolean $safe skip names that need no (further) encoding
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $untouched = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$untouched){
    // encode everything, then bring the path separators back
    $file = str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
43
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
56
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The string is inspected byte-wise (no /u modifier), so it does not have
 * to be valid UTF-8. An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true if no byte has the high bit set
 */
function utf8_isASCII($str){
  // a single byte in the range 0x80-0xFF is enough to fail the check
  return (preg_match('/[\x80-\xFF]/',$str) !== 1);
}
68
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, so what is returned is a pure
 * 7bit ASCII string. The input is handled byte-wise.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  return preg_replace('/[\x80-\xFF]/','',$str);
}
85
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the bytes and checks that every lead byte is followed by the
 * number of 10bbbbbb continuation bytes it announces. Lead bytes for
 * sequences of up to six bytes are accepted; the decoded values are
 * not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
 $len = strlen($Str);
 for ($pos=0; $pos<$len; $pos++) {
  $lead = ord($Str[$pos]);
  if ($lead < 0x80) continue;                    # 0bbbbbbb

  if     (($lead & 0xE0) == 0xC0) $tail=1;       # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $tail=2;       # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $tail=3;       # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $tail=4;       # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $tail=5;       # 1111110b
  else return false;                             # Does not match any model

  # the announced number of 10bbbbbb bytes has to follow
  while ($tail-- > 0) {
   $pos++;
   if ($pos == $len) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
  }
 }
 return true;
}
108
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() turns every character outside ISO-8859-1 into a single
 * '?'. For counting that is all that is needed, and it is even faster
 * than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return integer number of characters
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
123
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * The mbstring extension does the work when UTF8_MBSTRING is set. Without
 * it a regular expression is used for non-negative offset and length, and
 * a slower split-into-characters approach for negative values.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            // "up to the end": the full character count is always enough
            $length = mb_strlen($str,'utf-8');
        }
        // the result has to be returned, otherwise the slower code below runs too
        return mb_substr($str, $offset, $length, 'utf-8');
    }

    if ( $offset >= 0 && $length >= 0 ) {
        if ( $length === null ) {
            $length = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
               $length = '*';
            } else {
                $length = '{'.$length.'}';
            }
        }

        // skip $offset characters, capture the requested amount
        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;

    } else {
        // Handle negatives using different, slower technique
        // From: http://www.php.net/manual/en/function.substr.php#44838
        preg_match_all('/./u', $str, $ar);
        if( $length !== null ) {
            return join('',array_slice($ar[0],$offset,$length));
        } else {
            return join('',array_slice($ar[0],$offset));
        }
    }
}
181
182
/**
 * Unicode aware replacement for substr_replace()
 *
 * Keeps the first $start characters (only when $start is positive),
 * appends $replacement and then the rest of $string beginning at
 * character $start+$length. With the default $length of 0 the
 * replacement is inserted without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start>0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
196
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 regular expression to
 * split on. An empty separator raises an E_USER_WARNING and yields FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
212
/**
 * Unicode aware replacement for str_replace()
 *
 * Each search string is quoted and turned into a UTF-8 regular
 * expression. The replacement is handed to preg_replace() as is.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 */
function utf8_str_replace($s,$r,$str){
  if(is_array($s)){
    foreach (array_keys($s) as $idx) {
      $s[$idx] = '!'.preg_quote($s[$idx]).'!u';
    }
  }else{
    $s = '!'.preg_quote($s,'!').'!u';
  }
  return preg_replace($s,$r,$str);
}
230
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this is plain ltrim(). With a charlist, every
 * leading character contained in it is removed; the charlist may hold
 * multibyte characters.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass - this includes the caret,
  //which would otherwise negate (or break) the class when it comes first
  $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
246
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this is plain rtrim(). With a charlist, every
 * trailing character contained in it is removed; the charlist may hold
 * multibyte characters.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass - this includes the caret,
  //which would otherwise negate (or break) the class when it comes first
  $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
262
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist this is plain trim(). With a charlist, the
 * characters in it are stripped from both ends of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to go to both helpers, they fall back to whitespace otherwise
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
275
276
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points and every one that has an entry in the global
 * $UTF8_UPPER_TO_LOWER table is swapped for its lower case partner.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping; isset() keeps the lookup
    // from raising an undefined index notice for each of them
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
299
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points and every one that has an entry in the global
 * $UTF8_LOWER_TO_UPPER table is swapped for its upper case partner.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping; isset() keeps the lookup
    // from raising an undefined index notice for each of them
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
322
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * $case selects the tables that get applied: -1 only lower case letters
 * ($UTF8_LOWER_ACCENTS), 1 only upper case letters ($UTF8_UPPER_ACCENTS),
 * 0 (default) both, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string
 * @param  integer $case
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $maps = array();
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $maps[] = $UTF8_LOWER_ACCENTS;
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $maps[] = $UTF8_UPPER_ACCENTS;
  }

  foreach($maps as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
342
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is handed back unchanged, everything else is
 * translated with the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }
  return $string; //nothing to do
}
354
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters listed in $UTF8_SPECIAL_CHARS the bytes
 * 0x00 to 0x19 are matched as well (they are not part of that array).
 * The quoted character list is built on the first call and reused later.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;
  static $quoted = null;

  if($quoted === null){
    $quoted = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
  }

  // $additional goes into the character class unquoted
  $pattern = '/['.$additional.'\x00-\x19'.$quoted.']/u';
  return preg_replace($pattern,$repl,$string);
}
376
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. The fallback splits the
 * haystack at the needle and measures the first piece; with an offset
 * the haystack is cut first and the offset added to the position found.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @return mixed character position or FALSE if the needle was not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search the remainder, then translate back to the full string
    $found = utf8_strpos(utf8_substr($haystack, $offset),$needle);
    return ($found === false) ? false : $found + $offset;
  }

  $parts = utf8_explode($needle, $haystack);
  return (count($parts) > 1) ? utf8_strlen($parts[0]) : false;
}
408
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied as they are. Every well-formed 2, 3 or 4 byte
 * sequence becomes a decimal numeric entity (&#NNN;). A byte that does
 * not start such a sequence, or a sequence that is cut short, is copied
 * unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $i   = 0;
  while ($i < $max) {
    $lead = ord($str[$i]);

    // number of continuation bytes and the payload bits of the lead byte
    if     (($lead & 0xE0) == 0xC0) { $more = 1; $cp = $lead & 0x1F; } // 110x xxxx
    elseif (($lead & 0xF0) == 0xE0) { $more = 2; $cp = $lead & 0x0F; } // 1110 xxxx
    elseif (($lead & 0xF8) == 0xF0) { $more = 3; $cp = $lead & 0x07; } // 1111 0xxx
    else                            { $more = 0; $cp = 0; }            // ASCII or stray byte

    // collect 6 bits from each 10xx xxxx byte, give up on anything else
    $valid = ($more > 0) && ($i + $more < $max);
    for ($j = 1; $valid && $j <= $more; $j++) {
      $next = ord($str[$i + $j]);
      if (($next & 0xC0) == 0x80) {
        $cp = ($cp << 6) | ($next & 0x3F);
      } else {
        $valid = false;
      }
    }

    if ($valid) {
      $ret .= '&#' . $cp . ';';
      $i   += $more + 1;
    } else {
      $ret .= $str[$i];
      $i++;
    }
  }
  return $ret;
}
435
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence (this includes overlong
 * forms, surrogates and values above 0x10FFFF) and raises a PHP error
 * at level E_USER_WARNING. Without $strict such input is not rejected.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}
607
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING. Without
 * $strict such values are skipped.
 *
 * The string is built by plain concatenation; no output buffer is opened,
 * so an early return in strict mode leaves nothing behind.
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 *               (an empty string if the first argument is no array)
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $out = '';

    foreach ($arr as $k => $cp) {

        # ASCII range (including control chars)
        if ( ($cp >= 0) && ($cp <= 0x007f) ) {

            $out .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $out .= chr(0xc0 | ($cp >> 6));
            $out .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $out .= chr(0xe0 | ($cp >> 12));
            $out .= chr(0x80 | (($cp >> 6) & 0x003f));
            $out .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ($cp <= 0x10ffff) {

            $out .= chr(0xf0 | ($cp >> 18));
            $out .= chr(0x80 | (($cp >> 12) & 0x3f));
            $out .= chr(0x80 | (($cp >> 6) & 0x3f));
            $out .= chr(0x80 | ($cp & 0x3f));

        } elseif($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return FALSE;
        }
    }

    return $out;
}
698
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_string extension if available. The fallback converts the code
 * points returned by utf8_to_unicode() itself and writes code points
 * above 0xFFFF as surrogate pairs.
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // pack('n') keeps 16 bits only, so characters from the supplementary
      // planes have to be split into a high and a low surrogate
      $cp  -= 0x10000;
      $out .= pack('n',0xD800 | ($cp >> 10));
      $out .= pack('n',0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
714
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is read as big endian 16 bit units. A high surrogate that is
 * followed by a low surrogate is combined into a single code point; all
 * other units are passed on unchanged. The resulting list is encoded
 * with unicode_to_utf8().
 *
 * @param  string $str UTF-16BE string without BOM (taken by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return unicode_to_utf8($units);

  $uni  = array();
  $high = null; // high surrogate waiting for its partner
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = null;
        continue;
      }
      // unpaired high surrogate: hand it over as it is
      $uni[] = $high;
      $high  = null;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
724
725
// only needed if no mb_string available
if(!UTF8_MBSTRING){

  /**
   * UTF-8 Case lookup table
   *
   * This lookuptable maps the code points of lower case letters to the
   * code points of their corresponding upper case letters
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @see    utf8_strtoupper()
   */
  global $UTF8_LOWER_TO_UPPER;
  $UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x0123=>0x0122,
  );

  /**
   * UTF-8 Case lookup table
   *
   * This lookuptable maps the code points of upper case letters to the
   * code points of their corresponding lower case letters (it does so by
   * flipping $UTF8_LOWER_TO_UPPER)
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @see    utf8_strtolower()
   */
  global $UTF8_UPPER_TO_LOWER;
  $UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
796
797
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps accented lower case letters to a replacement made of characters
 * from the ASCII-7 range. Some replacements are longer than one
 * character (e.g. 'ss' or 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f',  'ë'=>'e',  'š'=>'s',  'ơ'=>'o',
  'ß'=>'ss', 'ă'=>'a',  'ř'=>'r',  'ț'=>'t',  'ň'=>'n',  'ā'=>'a',  'ķ'=>'k',
  'ŝ'=>'s',  'ỳ'=>'y',  'ņ'=>'n',  'ĺ'=>'l',  'ħ'=>'h',  'ṗ'=>'p',  'ó'=>'o',
  'ú'=>'u',  'ě'=>'e',  'é'=>'e',  'ç'=>'c',  'ẁ'=>'w',  'ċ'=>'c',  'õ'=>'o',
  'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t',  'ș'=>'s',  'ė'=>'e',  'ĉ'=>'c',
  'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c',  'ę'=>'e',  'ŵ'=>'w',  'ṫ'=>'t',
  'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e',  'ŷ'=>'y',  'ą'=>'a',  'ł'=>'l',
  'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g',  'ļ'=>'l',  'ƒ'=>'f',  'ž'=>'z',
  'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i',  'ï'=>'i',  'ḋ'=>'d',  'ť'=>'t',
  'ŗ'=>'r',  'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r',  'ê'=>'e',  'ü'=>'ue', 'ò'=>'o',
  'ē'=>'e',  'ñ'=>'n',  'ń'=>'n',  'ĥ'=>'h',  'ĝ'=>'g',  'đ'=>'d',  'ĵ'=>'j',
  'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',  'ư'=>'u',  'ţ'=>'t',  'ý'=>'y',  'ő'=>'o',
  'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z',  'ī'=>'i',  'ã'=>'a',  'ġ'=>'g',
  'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u',  'į'=>'i',  'ź'=>'z',  'á'=>'a',
  'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',  'ĕ'=>'e',
);
825
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps accented upper case letters to a replacement made of characters
 * from the ASCII-7 range. Some replacements are longer than one
 * character (e.g. 'Ae' or 'Th').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À'=>'A',  'Ô'=>'O',  'Ď'=>'D',  'Ḟ'=>'F',  'Ë'=>'E',  'Š'=>'S',  'Ơ'=>'O',
  'Ă'=>'A',  'Ř'=>'R',  'Ț'=>'T',  'Ň'=>'N',  'Ā'=>'A',  'Ķ'=>'K',
  'Ŝ'=>'S',  'Ỳ'=>'Y',  'Ņ'=>'N',  'Ĺ'=>'L',  'Ħ'=>'H',  'Ṗ'=>'P',  'Ó'=>'O',
  'Ú'=>'U',  'Ě'=>'E',  'É'=>'E',  'Ç'=>'C',  'Ẁ'=>'W',  'Ċ'=>'C',  'Õ'=>'O',
  'Ṡ'=>'S',  'Ø'=>'O',  'Ģ'=>'G',  'Ŧ'=>'T',  'Ș'=>'S',  'Ė'=>'E',  'Ĉ'=>'C',
  'Ś'=>'S',  'Î'=>'I',  'Ű'=>'U',  'Ć'=>'C',  'Ę'=>'E',  'Ŵ'=>'W',  'Ṫ'=>'T',
  'Ū'=>'U',  'Č'=>'C',  'Ö'=>'Oe', 'È'=>'E',  'Ŷ'=>'Y',  'Ą'=>'A',  'Ł'=>'L',
  'Ų'=>'U',  'Ů'=>'U',  'Ş'=>'S',  'Ğ'=>'G',  'Ļ'=>'L',  'Ƒ'=>'F',  'Ž'=>'Z',
  'Ẃ'=>'W',  'Ḃ'=>'B',  'Å'=>'A',  'Ì'=>'I',  'Ï'=>'I',  'Ḋ'=>'D',  'Ť'=>'T',
  'Ŗ'=>'R',  'Ä'=>'Ae', 'Í'=>'I',  'Ŕ'=>'R',  'Ê'=>'E',  'Ü'=>'Ue', 'Ò'=>'O',
  'Ē'=>'E',  'Ñ'=>'N',  'Ń'=>'N',  'Ĥ'=>'H',  'Ĝ'=>'G',  'Đ'=>'D',  'Ĵ'=>'J',
  'Ÿ'=>'Y',  'Ũ'=>'U',  'Ŭ'=>'U',  'Ư'=>'U',  'Ţ'=>'T',  'Ý'=>'Y',  'Ő'=>'O',
  'Â'=>'A',  'Ľ'=>'L',  'Ẅ'=>'W',  'Ż'=>'Z',  'Ī'=>'I',  'Ã'=>'A',  'Ġ'=>'G',
  'Ṁ'=>'M',  'Ō'=>'O',  'Ĩ'=>'I',  'Ù'=>'U',  'Į'=>'I',  'Ź'=>'Z',  'Á'=>'A',
  'Û'=>'U',  'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);
853
854/**
855 * UTF-8 array of common special characters
856 *
857 * This array should contain all special characters (not a letter or digit)
858 * defined in the various local charsets - it's not a complete list of non-alphanum
859 * characters in UTF-8. It's not perfect but should match most cases of special
860 * chars.
861 *
862 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
863 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
864 *
865 * @author Andreas Gohr <andi@splitbrain.org>
866 * @see    utf8_stripspecials()
867 */
// Declared with `global` so that the table still ends up in the global scope
// when this file is include()d from inside a function.
868global $UTF8_SPECIAL_CHARS;
869$UTF8_SPECIAL_CHARS = array(
  // Every entry is a hex integer identifying one character (0x0020 is the
  // space named in the header comment). The list is in ascending order.
  //
  // The blank slots in the next three rows mark values that are left out on
  // purpose: 0x2a (*), 0x2d (-), 0x2e (.), 0x3a (:) and 0x5f (_), as listed in
  // the header comment. The ASCII digits (0x30-0x39) and letters (0x41-0x5a,
  // 0x61-0x7a) are absent as well because they are not special characters.
  // Note that the list starts at 0x1a, so 0x1a-0x1f are included.
870  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
871  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
872          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
873  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  // From 0x7f to 0xbf every value is listed without gaps.
874  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
875  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
876  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
877  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
878  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
879  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  // Above 0xbf only selected values are listed (sparse from here on).
880  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
881  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
882  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
883  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
884  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
885  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
886  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
887  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
888  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
889  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
890  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
891  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
892  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
893  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
894  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
895  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
896  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
897  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
898  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
899  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
900  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
901  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
902  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
903  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
904  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
905  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
906  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
907  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
908  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
909  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
910  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
911  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
912  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
913  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
914  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  // NOTE(review): 0xf6d9-0xf8fe fall into Unicode's Private Use Area, so their
  // meaning depends on the font/charset they were taken from - confirm they
  // are still wanted here.
915  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
916  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
917  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
918  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
919  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
920);
921
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in scripts
 * that are not based on Latin letters into plain ASCII. Keys are UTF-8
 * strings (one or more characters), values are their ASCII replacements.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one-way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Peculiarities of the individual languages are not
 * supported.
 *
 * Several keys occur more than once (e.g. 'し' is listed with 'ci', 'si' and
 * 'shi'). PHP keeps only the last value given for a key in an array literal,
 * so just the final mapping of such a key is effective.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
// Declared with `global` so that the table still ends up in the global scope
// when this file is include()d from inside a function.
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  // Russian cyrillic (hard sign is dropped, soft sign becomes an apostrophe)
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // ('კ' is the k-sound; it used to be mapped to 'p', clashing with 'პ')
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  // Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  // Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  // Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  // (contains repeated keys; the last value listed for a key is the one used)
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  // (contains repeated keys; the last value listed for a key is the one used)
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  // (delta is 'D'/'d'; it used to be mapped to 'E'/'e')
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  // NOTE(review): keys containing '–' look like placeholders for "any
  // consonant" copied from a transcription chart; as literal keys they only
  // match text that really contains that dash - confirm whether intended.
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  // ('ㅛ' is 'yo', in line with 'ya'/'ye'/'yu'; it used to duplicate the
  // 'oy' that belongs to 'ㅚ')
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1087
1088//Setup VIM: ex: et ts=2 enc=utf-8 :
1089
1090