xref: /dokuwiki/inc/utf8.php (revision b7b9e2f23c056d1778b8aaa689ac65e1c61884b4)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are not encoded.
 *
 * When the second parameter is true the string is returned untouched if it
 * consists only of letters, digits and the characters / _ - . %
 * An already encoded name passes that check, which makes it safe to run the
 * function several times on the same string (default is true).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename to encode
 * @param  boolean $safe skip the encoding when only safe characters are found
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier pins $ to the very end of the subject. Without it a name
  // ending in a newline would be accepted as safe and returned unencoded.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  $file = urlencode($file);
  $file = str_replace('%2F','/',$file);
  return $file;
}
30
/**
 * URL-decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
43
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check is done bytewise (no u modifier): any byte above 0x7F makes the
 * function return false. An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean
 */
function utf8_isASCII($str){
  return !preg_match('/[\x80-\xFF]/',$str);
}
55
/**
 * Strips all highbyte chars
 *
 * Removes every byte above 0x7F (bytewise, no u modifier) and returns the
 * remaining pure 7bit ASCII string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
72
/**
 * Tries to detect if a string is UTF-8 encoded
 *
 * This is a structural check only: every lead byte has to be followed by the
 * number of continuation bytes (10bbbbbb) it announces. Lead bytes for
 * sequences of up to six bytes are accepted and the decoded values are not
 * inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean
 */
function utf8_check($Str) {
 $len = strlen($Str);
 $i = 0;
 while ($i < $len) {
  $lead = ord($Str[$i]);
  if ($lead < 0x80) { # 0bbbbbbb
   $i++;
   continue;
  }
  if (($lead & 0xE0) == 0xC0) $n=1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $n=2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $n=3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $n=4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $n=5; # 1111110b
  else return false; # Does not match any model
  # n bytes matching 10bbbbbb have to follow
  for ($j=0; $j<$n; $j++) {
   $i++;
   if ($i == $len) return false;
   if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
  }
  $i++;
 }
 return true;
}
95
/**
 * Unicode aware replacement for strlen()
 *
 * The string is passed through utf8_decode() and the bytes of the result are
 * counted. utf8_decode() turns characters that do not exist in ISO-8859-1
 * into '?', which does not matter when only counting. The original author
 * notes this as being faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return int
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
110
/**
 * Unicode aware replacement for substr()
 *
 * The string is split into single characters which are then sliced with
 * array_slice(), so $start and $length count characters, not bytes.
 *
 * An empty string is returned when the input is empty or can not be matched
 * as UTF-8.
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str
 * @param  int      $start  first character to return, negative counts from the end
 * @param  int|null $length number of characters, null means up to the end
 * @return string
 */
function utf8_substr($str,$start,$length=null){
   // the s modifier lets the dot match newlines - without it they were
   // silently dropped from the result
   if(!preg_match_all('/./us', $str, $ar)) return '';

   // strict comparison: a length of 0 has to give an empty string and must
   // not be mistaken for "no length given"
   if($length !== null) {
       return join('',array_slice($ar[0],$start,$length));
   }
   return join('',array_slice($ar[0],$start));
}
127
/**
 * Unicode aware replacement for substr_replace()
 *
 * Positions are counted in characters. Unlike substr_replace() the default
 * $length is 0, i.e. the replacement is inserted at $start.
 *
 * A negative $start counts from the end of the string, a negative $length
 * leaves that many characters at the end of the string untouched.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start
 * @param  int    $length
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $total = utf8_strlen($string);

  // translate relative positions into absolute ones; before this a negative
  // $start dropped everything in front of the insertion point
  if($start < 0){
    $start = max(0, $total + $start);
  }elseif($start > $total){
    $start = $total;
  }
  if($length < 0){
    $length = max(0, $total - $start + $length);
  }

  $ret = '';
  if($start>0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
141
/**
 * Unicode aware replacement for explode()
 *
 * The separator is quoted and used as an UTF-8 pattern for preg_split().
 * An empty separator raises an E_USER_WARNING and gives FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the string to split
 * @return mixed  whatever preg_split() returns, FALSE on an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
157
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and run as UTF-8 patterns through
 * preg_replace(). Replacements are inserted literally.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to replace with
 * @param  string $str the string to work on
 * @return mixed  whatever preg_replace() returns
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // preg_replace() treats $n, ${n} and \n in the replacement as references
  // to matched groups. Escaping backslash and dollar keeps the replacement
  // literal, just like str_replace() does.
  $escape = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr($v,$escape);
    }
  }else{
    $r = strtr($r,$escape);
  }

  return preg_replace($s,$r,$str);
}
175
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the work is handed to ltrim(). With a charlist every
 * character of it is removed from the beginning of the string; the charlist
 * is taken literally (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote the charlist for use in a character class. preg_quote() covers
  // \ - ] [ and the delimiter, and also ^ which was left unescaped before
  // and broke (or negated) the class when it came first in the list
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
191
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the work is handed to rtrim(). With a charlist every
 * character of it is removed from the end of the string; the charlist is
 * taken literally (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote the charlist for use in a character class. preg_quote() covers
  // \ - ] [ and the delimiter, and also ^ which was left unescaped before
  $charlist = preg_quote($charlist,'/');

  // D modifier: $ must only match at the very end. Otherwise characters in
  // front of a trailing newline would be stripped, which rtrim() never does
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
207
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the work is handed to trim(). With a charlist the
 * string is passed through utf8_rtrim() and utf8_ltrim() using that list.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed on, otherwise both helpers fall back to
  // stripping whitespace and the given list is ignored
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
220
221
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension if it is available and UTF8_NOMBSTRING is not
 * defined. Otherwise the string is converted to code points which are looked
 * up in $UTF8_UPPER_TO_LOWER; code points without an entry stay as they are.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an undefined index notice for every code point that
    // has no lower case mapping
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
245
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension if it is available and UTF8_NOMBSTRING is not
 * defined. Otherwise the string is converted to code points which are looked
 * up in $UTF8_LOWER_TO_UPPER; code points without an entry stay as they are.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  // check for the function that is actually called below
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an undefined index notice for every code point that
    // has no upper case mapping
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
269
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional parameter selects which letters are handled: lower case only
 * ($case = -1), upper case only ($case = 1) or both ($case = 0, the default).
 * The replacements come from $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case
 * @return string
 */
function utf8_deaccent($string,$case=0){
  // collect the tables to apply, lower case first
  $tables = array();
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $tables[] = $UTF8_LOWER_ACCENTS;
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $tables[] = $UTF8_UPPER_ACCENTS;
  }

  foreach($tables as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
289
/**
 * Romanize a non-latin string
 *
 * Pure ASCII strings are returned as they are, everything else is translated
 * with strtr() using the $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  //nothing to do
  return $string;
}
301
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters listed in $UTF8_SPECIAL_CHARS the bytes 0x00 to
 * 0x19 are stripped as well (they are not part of that array). The quoted
 * list of specials is built once and kept in a static variable.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;

  static $specials = null;
  if($specials === null){
    $chars    = unicode_to_utf8($UTF8_SPECIAL_CHARS);
    $specials = preg_quote($chars, '/');
  }

  // $additional goes into the character class as it is given
  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
323
/**
 * This is an Unicode aware replacement for strpos()
 *
 * Uses the mbstring extension if it is available and UTF8_NOMBSTRING is not
 * defined. The fallback splits the haystack at the needle and measures the
 * part in front of the first occurrence.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset character position to start the search at
 * @return mixed  character position of the first match or FALSE
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    // this used to read an undefined $str instead of $haystack, so the
    // fallback never found anything
    $ar = utf8_explode($needle, $haystack);
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search in the remainder and shift the result back by the offset
  $rest = utf8_substr($haystack, $offset);
  if ( false !== ($pos = utf8_strpos($rest,$needle))){
     return $pos + $offset;
  }
  return false;
}
356
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied as they are. Every well formed sequence of two,
 * three or four bytes is replaced by a decimal numeric entity (&#NNN;).
 * Bytes that do not belong to such a sequence (stray continuation bytes,
 * truncated sequences, other lead bytes) are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str
 * @return string
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $i   = 0;
  while ($i < $max) {
    $lead = ord($str[$i]);

    // find the sequence length and the payload bits of the lead byte
    if (($lead & 0xE0) == 0xC0) {        // 110x xxxx
      $extra = 1;
      $cp = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {  // 1110 xxxx
      $extra = 2;
      $cp = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {  // 1111 0xxx
      $extra = 3;
      $cp = $lead & 0x07;
    } else {                             // ASCII or not a usable lead byte
      $ret .= $str[$i];
      $i++;
      continue;
    }

    // all continuation bytes have to be there and look like 10xx xxxx
    $ok = ($i + $extra < $max);
    for ($j = 1; $ok && $j <= $extra; $j++) {
      $next = ord($str[$i + $j]);
      if (($next & 0xC0) != 0x80) {
        $ok = false;
      } else {
        $cp = ($cp << 6) | ($next & 0x3F);
      }
    }

    if (!$ok) {
      // broken sequence: keep the byte and carry on with the next one
      $ret .= $str[$i];
      $i++;
      continue;
    }

    $ret .= '&#' . $cp . ';';
    $i += $extra + 1;
  }
  return $ret;
}
383
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence (this includes a sequence that
 * is cut off at the end of the string) and raises a PHP error at level
 * E_USER_WARNING.
 *
 * Without $strict, octets that fit nowhere are skipped and code points
 * from illegal sequences (non-shortest form, surrogates, out of range)
 * are still added to the output.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // The input ended in the middle of a multi-octet sequence. The loop
    // above can not notice that, so strict mode has to check it here.
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                'sequence at the end of the UTF-8 string, byte '.$len,
                E_USER_WARNING
            );
        return FALSE;
    }

    return $out;
}
555
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are skipped.
 *
 * Anything that is not an array gives an empty string.
 *
 * @param  array of unicode code points representing a string
 * @param  boolean Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    // The result is collected in a plain string. The former output buffer
    // was left open whenever strict mode bailed out early.
    $out = '';

    foreach ($arr as $k => $cp) {

        # Outside of the Unicode range. Negative values are caught here as
        # well, they used to end up in the 2 byte branch
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $out .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $out .= chr(0xc0 | ($cp >> 6));
            $out .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $out .= chr(0xe0 | ($cp >> 12));
            $out .= chr(0x80 | (($cp >> 6) & 0x003f));
            $out .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $out .= chr(0xf0 | ($cp >> 18));
            $out .= chr(0x80 | (($cp >> 12) & 0x3f));
            $out .= chr(0x80 | (($cp >> 6) & 0x3f));
            $out .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $out;
}
646
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() if the mbstring extension is available and
 * UTF8_NOMBSTRING is not defined. The fallback decodes the string with
 * utf8_to_unicode() and writes code points above 0xFFFF as surrogate pairs.
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding'))
    return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // pack('n') keeps the lower 16 bits only, so anything outside the
      // BMP has to be split into a high and a low surrogate
      $cp -= 0x10000;
      $out .= pack('nn', 0xD800 | (($cp >> 10) & 0x3FF), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
663
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is read as big endian 16 bit units. A high surrogate followed
 * by a low surrogate is combined into one code point, everything else is
 * handed to unicode_to_utf8() as it is.
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = 0; // pending high surrogate, 0 if there is none
  foreach($units as $unit){
    if($high){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        // complete pair
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high = 0;
        continue;
      }
      // unpaired high surrogate: pass it on unchanged
      $uni[] = $high;
      $high = 0;
    }
    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
      continue;
    }
    $uni[] = $unit;
  }
  if($high) $uni[] = $high;

  return unicode_to_utf8($uni);
}
673
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the code points of lower case letters to the code
 * points of their corresponding upper case letters.
 *
 * The table is bound to the global scope explicitly so that functions
 * fetching it with "global" also find it when this file is included from
 * inside a function. Entries that were listed more than once (0x00FE,
 * 0x00F0, 0x03BB, 0x0457) are kept only at their first position.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
global $UTF8_LOWER_TO_UPPER;
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x0123=>0x0122,
);
727
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the code points of upper case letters to the code
 * points of their corresponding lower case letters. It is built by flipping
 * $UTF8_LOWER_TO_UPPER.
 *
 * The result is bound to the global scope explicitly so that functions
 * fetching it with "global" also find it when this file is included from
 * inside a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
global $UTF8_UPPER_TO_LOWER;
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
737
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented lower case characters to replacements
 * from the ASCII-7 range. It contains lower case letters only.
 *
 * The table is bound to the global scope explicitly so that functions
 * fetching it with "global" also find it when this file is included from
 * inside a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
764
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented upper case characters to replacements
 * from the ASCII-7 range. It contains upper case letters only.
 *
 * The table is bound to the global scope explicitly so that functions
 * fetching it with "global" also find it when this file is included from
 * inside a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
791
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * The array is bound to the global scope explicitly so that functions
 * fetching it with "global" also find it when this file is included from
 * inside a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
858
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Keys are UTF-8 strings (a single character or a short character sequence),
 * values are their ASCII replacements; a value may be the empty string when
 * the character is simply dropped.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Several keys are listed more than once with different values (mostly in the
 * Japanese sections, which list alternative spellings of the same kana).
 * PHP keeps only the LAST value given for a repeated key, so the later entry
 * is the effective one.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // ('კ' is the letter kan and maps to 'k'; 'პ' is par and maps to 'p')
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  // (repeated keys below: the last value listed for a key is the effective one)
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  // no 'い'=>'yi' entry here: it would override 'い'=>'i' from the first row
  'よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  // (repeated keys below: the last value listed for a key is the effective one)
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  // no 'イ'=>'yi' entry here: it would override 'イ'=>'i' from the first row
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  // (only letters without an entry elsewhere are listed; delta maps to D/d)
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  // NOTE(review): many vowel keys below contain a literal '–' dash, apparently
  // the consonant placeholder of the source transcription table. As written
  // they only match text that really contains that dash; confirm and rework.
  // 'รร' and 'ว' are listed twice; the later value ('an' / 'ua') is effective.
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  // ('ㅚ' is 'oy' and 'ㅛ' is 'yo'; the two must not share a value)
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1023
1024//Setup VIM: ex: et ts=2 enc=utf-8 :
1025
1026