xref: /dokuwiki/inc/utf8.php (revision 10f09f2a66400f77b4696f973c4c526424e44bc1)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are left unencoded so directory separators survive.
 *
 * With $safe enabled (the default) a name that consists solely of
 * ASCII letters, digits and the characters / _ - . % is returned
 * untouched. This makes it safe to run the function several times
 * on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename to encode
 * @param  boolean $safe skip names that only contain "safe" characters
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $needsNoEncoding = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$needsNoEncoding){
    // encode everything, then restore the path separators
    return str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
30
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file URL-encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
43
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check is done byte-wise (no /u modifier), so it works on any
 * input regardless of whether it is valid UTF-8.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true if no byte above 0x7F is present
 */
function utf8_isASCII($str){
  // a single byte in the range 0x80-0xFF disqualifies the string
  return !preg_match('/[\x80-\xFF]/',$str);
}
55
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, what remains is a pure
 * 7bit ASCII string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  // byte-wise match (no /u modifier) so invalid UTF-8 is handled as well
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
72
/**
 * Tries to detect if a string is UTF-8 encoded
 *
 * Only the byte structure is verified: every lead byte has to be
 * followed by the number of continuation bytes (10bbbbbb) it
 * announces. Lead bytes announcing up to five continuation bytes
 * are accepted; the decoded values themselves are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
  $len = strlen($Str);
  $pos = 0;

  while ($pos < $len) {
    $lead = ord($Str[$pos]);

    if ($lead < 0x80) {                       // 0bbbbbbb
      $pos++;
      continue;
    }

    // how many continuation bytes does the lead byte announce?
    if     (($lead & 0xE0) == 0xC0) $follow = 1;   // 110bbbbb
    elseif (($lead & 0xF0) == 0xE0) $follow = 2;   // 1110bbbb
    elseif (($lead & 0xF8) == 0xF0) $follow = 3;   // 11110bbb
    elseif (($lead & 0xFC) == 0xF8) $follow = 4;   // 111110bb
    elseif (($lead & 0xFE) == 0xFC) $follow = 5;   // 1111110b
    else return false;                             // matches no model

    while ($follow-- > 0) {
      $pos++;
      if ($pos == $len) return false;                       // string ended too early
      if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;   // not 10bbbbbb
    }
    $pos++;
  }

  return true;
}
95
/**
 * Unicode aware replacement for strlen()
 *
 * Uses the mb_string extension if available. Otherwise all UTF-8
 * continuation bytes (10bbbbbb) are dropped and the remaining bytes
 * are counted, which equals the number of characters for valid UTF-8.
 *
 * utf8_decode() is no longer used here because it is deprecated in
 * current PHP versions.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string
 * @return int number of UTF-8 characters
 */
function utf8_strlen($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen')){
    return mb_strlen($string,'UTF-8');
  }

  // every character contributes exactly one non-continuation byte
  return strlen(preg_replace('/[\x80-\xBF]/','',$string));
}
110
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * Uses the mb_string extension if available; the pure PHP fallback
 * supports negative offsets and lengths as well but is slower for them.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 string
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_substr')){
        if( $length === null ){
            // pass an explicit length instead of relying on how
            // mb_substr() treats a null length
            return mb_substr($str, $offset, mb_strlen($str, 'UTF-8'), 'UTF-8');
        }
        return mb_substr($str, $offset, $length, 'UTF-8');
    }

    if ( $offset >= 0 && ( $length === null || $length >= 0 ) ) {
        if ( $length === null ) {
            $quantifier = '*';
        } else {
            // character count: all bytes that are not continuation bytes
            $strlen = strlen(preg_replace('/[\x80-\xBF]/', '', $str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                // requested more than available: take the rest
                $quantifier = '*';
            } else {
                $quantifier = '{'.$length.'}';
            }
        }

        // skip $offset characters, then capture the wanted ones
        $pattern = '/^.{'.$offset.'}(.'.$quantifier.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;
    }

    // Handle negatives using different, slower technique
    // From: http://www.php.net/manual/en/function.substr.php#44838
    // The s modifier is required, otherwise newlines would be lost
    preg_match_all('/./us', $str, $ar);
    if( $length !== null ) {
        return join('',array_slice($ar[0],$offset,$length));
    }
    return join('',array_slice($ar[0],$offset));
}
168
169
/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is assembled from the part of $string in front of
 * $start (only when $start is positive), the replacement, and the
 * part of $string beginning at $start+$length.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 string
 * @param  string  $replacement text to insert
 * @param  integer $start       character offset of the replaced part
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
183
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 regular expression
 * for preg_split(). An empty separator raises an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator
 * @param  string $str UTF-8 string to split
 * @return mixed array of parts or FALSE on an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
199
/**
 * Unicode aware replacement for str_replace()
 *
 * Each search string is quoted and turned into a UTF-8 regular
 * expression; the actual work is done by preg_replace(), which
 * also receives $r unchanged.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   search string or array of search strings
 * @param  mixed  $r   replacement(s), handed to preg_replace() as is
 * @param  string $str UTF-8 string to work on
 * @return mixed result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
  if(is_array($s)){
    $patterns = array();
    foreach ($s as $key => $needle) {
      $patterns[$key] = '!'.preg_quote($needle).'!u';
    }
  }else{
    $patterns = '!'.preg_quote($s,'!').'!u';
  }

  return preg_replace($patterns,$r,$str);
}
217
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. Otherwise every
 * character of the charlist is stripped from the start of the string;
 * the charlist is taken literally (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote charlist for use in a character class: backslash, dash,
  // brackets, the delimiter and the negating caret need escaping
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\\$1',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
233
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. Otherwise every
 * character of the charlist is stripped from the end of the string;
 * the charlist is taken literally (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 string
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote charlist for use in a character class: backslash, dash,
  // brackets, the delimiter and the negating caret need escaping
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\\$1',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
249
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used. Otherwise the
 * charlist is handed on to utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 string
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be passed on, otherwise only whitespace is trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
262
263
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the code points
 * are mapped through the $UTF8_UPPER_TO_LOWER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 string
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  foreach ($uni as $i => $cp){
    // isset() avoids an undefined-key notice for every unmapped code point
    if(isset($UTF8_UPPER_TO_LOWER[$cp])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$cp];
    }
  }
  return unicode_to_utf8($uni);
}
287
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the code points
 * are mapped through the $UTF8_LOWER_TO_UPPER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 string
 * @return string
 */
function utf8_strtoupper($string){
  // check for the function that is actually called below
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  foreach ($uni as $i => $cp){
    // isset() avoids an undefined-key notice for every unmapped code point
    if(isset($UTF8_LOWER_TO_UPPER[$cp])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$cp];
    }
  }
  return unicode_to_utf8($uni);
}
311
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional parameter restricts the replacement to lower case
 * ($case = -1) or upper case ($case = 1) letters. By default
 * ($case = 0) both lookup tables are applied, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string UTF-8 string
 * @param  integer $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

  // collect the lookup tables that apply, in the order they are used
  $tables = array();
  if($case <= 0) $tables[] = $UTF8_LOWER_ACCENTS;
  if($case >= 0) $tables[] = $UTF8_UPPER_ACCENTS;

  foreach($tables as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
331
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as is, everything else is translated
 * with strtr() using the $UTF8_ROMANIZATION lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 string
 * @return string
 */
function utf8_romanize($string){
  global $UTF8_ROMANIZATION;

  if(!utf8_isASCII($string)){
    $string = strtr($string,$UTF8_ROMANIZATION);
  }
  return $string;
}
343
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters listed in $UTF8_SPECIAL_CHARS the bytes
 * 0x00 to 0x19 are stripped as well. The quoted list of specials is
 * built on the first call and then kept in a static variable.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (inserted unescaped into the regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;
  static $specials = null;

  if($specials === null){
    $raw      = unicode_to_utf8($UTF8_SPECIAL_CHARS);
    $specials = preg_quote($raw, '/');
  }

  $class = '['.$additional.'\x00-\x19'.$specials.']';
  return preg_replace('/'.$class.'/u',$repl,$string);
}
365
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. The fallback splits the
 * haystack at the needle and measures the part in front of it.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack UTF-8 string to search in
 * @param  string  $needle   UTF-8 string to search for
 * @param  integer $offset   character offset to start the search at
 * @return mixed character position of the first match or FALSE
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() gives FALSE for an empty needle
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search in the remainder and shift the result back by the offset
  $rest = utf8_substr($haystack, $offset);

  if ( false !== ($pos = utf8_strpos($rest,$needle))){
     return $pos + $offset;
  }
  return false;
}
398
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied unchanged. Every well-formed two, three or
 * four byte sequence is replaced by a decimal numeric entity (&#NNN;).
 * Bytes that do not start such a sequence, or sequences that are cut
 * short, are copied unchanged as well. The decoded value itself is
 * not validated (no check for overlong forms or surrogates).
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 string
 * @return string
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);

  for ($i=0; $i<$max; $i++) {
    $byte = ord($str[$i]);

    // lead byte: number of continuation bytes and payload bits
    if ($byte < 0x80) {                    // 0bbbbbbb - plain ASCII
      $ret .= $str[$i];
      continue;
    } elseif (($byte & 0xE0) == 0xC0) {    // 110bbbbb
      $extra = 1;
      $cp    = $byte & 0x1F;
    } elseif (($byte & 0xF0) == 0xE0) {    // 1110bbbb
      $extra = 2;
      $cp    = $byte & 0x0F;
    } elseif (($byte & 0xF8) == 0xF0) {    // 11110bbb
      $extra = 3;
      $cp    = $byte & 0x07;
    } else {                               // stray continuation or illegal byte
      $ret .= $str[$i];
      continue;
    }

    // all announced continuation bytes have to exist and look like 10bbbbbb
    $valid = ($i + $extra < $max);
    for ($j=1; $valid && $j<=$extra; $j++) {
      $next = ord($str[$i+$j]);
      if (($next & 0xC0) != 0x80) {
        $valid = false;
      } else {
        $cp = ($cp << 6) | ($next & 0x3F);
      }
    }

    if (!$valid) {
      // keep the byte as it is and continue with the next one
      $ret .= $str[$i];
      continue;
    }

    $ret .= '&#' . $cp . ';';
    $i   += $extra;
  }

  return $ret;
}
425
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence (this includes non-shortest
 * forms, surrogates and values above 0x10FFFF) and raises a PHP error
 * at level E_USER_WARNING.
 *
 * Without $strict nothing is rejected: octets that fit neither as lead
 * nor as continuation octet are skipped, and a completed sequence is
 * added to the output even if it fails the validity checks.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its six payload bits
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}
597
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * Without $strict such values are silently left out of the result.
 *
 * The string is built by plain concatenation; no output buffer is
 * opened, so nothing is left open when the function bails out early.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # negative values and values above U+10FFFF are no code points
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $utf8;
}
688
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_string extension if available. The fallback decodes the
 * string with utf8_to_unicode() and writes code points above 0xFFFF
 * (up to 0x10FFFF) as a surrogate pair; all other values are packed
 * as a single 16 bit unit.
 *
 * @param  string  $str UTF-8 string (passed by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding'))
    return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF && $cp <= 0x10FFFF){
      // astral plane: split into high and low surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
705
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is read as big endian 16 bit units. A high surrogate
 * directly followed by a low surrogate is combined into one code
 * point; all other units are handed to unicode_to_utf8() unchanged.
 *
 * @param  string $str UTF-16BE string (passed by reference, not modified)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = null;   // pending high surrogate waiting for its partner
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = null;
        continue;
      }
      // no low surrogate followed: pass the lone unit on as it is
      $uni[] = $high;
      $high  = null;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
715
/**
 * UTF-8 Case lookup tables
 *
 * $UTF8_LOWER_TO_UPPER maps the code point of a lower case letter to
 * the code point of its corresponding upper case letter.
 * $UTF8_UPPER_TO_LOWER is the reverse mapping and is created by
 * flipping $UTF8_LOWER_TO_UPPER.
 *
 * Both are declared global explicitly so that they end up in the
 * global scope (where utf8_strtolower() and utf8_strtoupper() look
 * for them) even if this file is included from within a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtolower()
 * @see    utf8_strtoupper()
 */
global $UTF8_LOWER_TO_UPPER, $UTF8_UPPER_TO_LOWER;

$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x0123=>0x0122,
);

// reverse lookup: upper case code point => lower case code point
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
779
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table defines ASCII-7 replacements for accented
 * characters. These are lower case letters only.
 *
 * The table is declared global explicitly so that it ends up in the
 * global scope (where utf8_deaccent() looks for it) even if this file
 * is included from within a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;

$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
806
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table defines ASCII-7 replacements for accented
 * characters. These are upper case letters only.
 *
 * The table is declared global explicitly so that it ends up in the
 * global scope (where utf8_deaccent() looks for it) even if this file
 * is included from within a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;

$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
833
/**
 * Code points of common special characters
 *
 * Lists the Unicode code points of characters that are neither letters nor
 * digits and that occur in the various local charsets. The list is not an
 * exhaustive inventory of non-alphanumeric characters in Unicode, but it
 * should cover most special characters met in practice.
 *
 * Deliberately left out:
 *  - the control characters 0x00 to 0x19 (the space, 0x20, IS included)
 *  - the characters _ (0x5f), : (0x3a), . (0x2e), - (0x2d) and * (0x2a)
 *
 * The values are listed in ascending order; the comments inside the array
 * name the Unicode block each group of values belongs to.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  // Basic Latin: remaining control chars, space, punctuation and DEL
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,
  0x002b, 0x002c, 0x002f,
  0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040,
  0x005b, 0x005c, 0x005d, 0x005e, 0x0060,
  0x007b, 0x007c, 0x007d, 0x007e, 0x007f,

  // Latin-1 Supplement: C1 controls, symbols, multiplication and division sign
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
  0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
  0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
  0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
  0x00d7, 0x00f7,

  // Spacing Modifier Letters
  0x02c7, 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd,

  // Combining Diacritical Marks
  0x0300, 0x0301, 0x0303, 0x0309, 0x0323,

  // Greek
  0x0384, 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6,

  // Hebrew
  0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9,
  0x05bb, 0x05bc, 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3,
  0x05f3, 0x05f4,

  // Arabic
  0x060c, 0x061b, 0x061f, 0x0640,
  0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 0x0652,
  0x066a,

  // Thai
  0x0e3f,

  // General Punctuation
  0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044,

  // Currency Symbols
  0x20a7, 0x20aa, 0x20ab, 0x20ac,

  // Letterlike Symbols
  0x2116, 0x2118, 0x2122, 0x2126, 0x2135,

  // Arrows
  0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195,
  0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4,

  // Mathematical Operators
  0x2200, 0x2202, 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209,
  0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5,

  // Miscellaneous Technical
  0x2310, 0x2320, 0x2321, 0x2329, 0x232a,

  // Enclosed Alphanumerics
  0x2469,

  // Box Drawing
  0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c,
  0x2550, 0x2551, 0x2552, 0x2553, 0x2554, 0x2555, 0x2556, 0x2557,
  0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, 0x255f,
  0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c,

  // Block Elements
  0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 0x2591, 0x2592, 0x2593,

  // Geometric Shapes
  0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,

  // Miscellaneous Symbols
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666,

  // Dingbats
  0x2701, 0x2702, 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709,
  0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717,
  0x2718, 0x2719, 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f,
  0x2720, 0x2721, 0x2722, 0x2723, 0x2724, 0x2725, 0x2726, 0x2727,
  0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 0x272f,
  0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737,
  0x2738, 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f,
  0x2740, 0x2741, 0x2742, 0x2743, 0x2744, 0x2745, 0x2746, 0x2747,
  0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 0x274f,
  0x2750, 0x2751, 0x2752, 0x2756,
  0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 0x275d, 0x275e,
  0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767,
  0x277f, 0x2789, 0x2793, 0x2794,
  0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 0x279f,
  0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7,
  0x27a8, 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af,
  0x27b1, 0x27b2, 0x27b3, 0x27b4, 0x27b5, 0x27b6, 0x27b7,
  0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 0x27be,

  // Private Use Area
  0xf6d9, 0xf6da, 0xf6db,
  0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 0xf8dd, 0xf8de, 0xf8df,
  0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 0xf8e7,
  0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef,
  0xf8f0, 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7,
  0xf8f8, 0xf8f9, 0xf8fa, 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe,

  // Arabic Presentation Forms-B
  0xfe7c, 0xfe7d,
);
900
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based on Latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one-way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Some keys appear more than once (most notably in the Japanese and Thai
 * sections). In a PHP array literal a later entry overwrites an earlier one
 * with the same key, so only the last listed value is effective for such keys.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1065
1066//Setup VIM: ex: et ts=2 enc=utf-8 :
1067
1068