xref: /dokuwiki/inc/utf8.php (revision 9f9db1ad22c6352bfd4291b205f7e24ebd5ac9c9)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect mbstring support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, 0 otherwise. A value defined earlier is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// let the mb_* functions work on UTF-8 by default
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_encodeFN')){
    /**
     * URL-encode a filename so it may contain Unicode characters
     *
     * Slashes are kept as they are.
     *
     * With $safe enabled (the default) a string consisting only of
     * ASCII letters, digits and the characters / _ - . % is returned
     * unchanged, which makes it safe to run the function several
     * times on the same string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urlencode
     * @param  string $file filename to encode
     * @param  bool   $safe skip encoding for strings that look encoded already
     * @return string
     */
    function utf8_encodeFN($file,$safe=true){
        if(!$safe || !preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
            // encode everything, then restore the path separators
            return str_replace('%2F','/',urlencode($file));
        }
        return $file;
    }
}
45
if(!function_exists('utf8_decodeFN')){
    /**
     * URL-decode a filename
     *
     * Thin wrapper around urldecode()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urldecode
     * @param  string $file encoded filename
     * @return string
     */
    function utf8_decodeFN($file){
        return urldecode($file);
    }
}
60
if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return bool true when no byte has the high bit set (also for '')
     */
    function utf8_isASCII($str){
        // Byte-wise match (no /u modifier): any byte >= 0x80 means non-ASCII.
        // Replaces the former $str{$i} loop, a syntax PHP 8 no longer parses.
        return !preg_match('/[\x80-\xFF]/', $str);
    }
}
74
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte >= 0x80 is removed,
     * all other bytes are kept in order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return string
     */
    function utf8_strip($str){
        // Byte-wise removal (no /u modifier). Replaces the former $str{$i}
        // loop, a syntax PHP 8 no longer parses.
        return preg_replace('/[\x80-\xFF]+/', '', (string) $str);
    }
}
93
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the structure is checked: every lead byte has to be followed by
     * the number of continuation bytes (10bbbbbb) it announces. Lead bytes
     * announcing up to five continuation bytes are accepted; no check for
     * overlong forms or code point ranges is made.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str
     * @return bool
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {              // 0bbbbbbb - plain ASCII
                $pos++;
                continue;
            }

            // number of continuation bytes announced by the lead byte
            if (($lead & 0xE0) == 0xC0)     $trail = 1; // 110bbbbb
            elseif (($lead & 0xF0) == 0xE0) $trail = 2; // 1110bbbb
            elseif (($lead & 0xF8) == 0xF0) $trail = 3; // 11110bbb
            elseif (($lead & 0xFC) == 0xF8) $trail = 4; // 111110bb
            elseif (($lead & 0xFE) == 0xFC) $trail = 5; // 1111110b
            else return false;                          // no model matches

            for ($k = 0; $k < $trail; $k++) {
                $pos++;
                if ($pos == $total) return false;                      // string ends too early
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;    // not 10bbbbbb
            }
            $pos++;
        }

        return true;
    }
}
120
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Every UTF-8 character consists of exactly one non-continuation byte
     * (ASCII or lead byte) plus zero or more continuation bytes
     * (0x80-0xBF), so counting the non-continuation bytes yields the
     * number of characters. This avoids utf8_decode(), which is deprecated
     * as of PHP 8.2.
     *
     * For malformed UTF-8 the result is the number of non-continuation
     * bytes, which may differ from what utf8_decode() based counting gave.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        // byte-wise (no /u): drop all continuation bytes, count what is left
        return strlen(preg_replace('/[\x80-\xBF]+/', '', (string) $string));
    }
}
137
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * Uses mb_substr() when UTF8_MBSTRING is set, a PCRE based fallback
     * otherwise. Like mb_substr() the fallback returns an empty string
     * (never false) when nothing can be extracted.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param  string  $str
     * @param  integer $offset number of UTF-8 characters offset (from left), negative counts from the end
     * @param  integer $length (optional) length in UTF-8 characters from offset, negative omits from the end
     * @return string
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         *
         * the character count is the number of bytes that are not UTF-8 continuation bytes (0x80-0xBF).
         * This replaces strlen(utf8_decode()), which is deprecated as of PHP 8.2. For valid UTF-8 both
         * give the same number; for invalid UTF-8 the final 'u' flagged match fails and '' is returned
         * whatever the count was.
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = strlen(preg_replace('/[\x80-\xBF]+/', '', $str));   // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if (!isset($strlen)) $strlen = strlen(preg_replace('/[\x80-\xBF]+/', '', $str));   // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                // reduce any length that would go past the end of the string
                $length = min($strlen-$offset, $length);

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                // more characters to omit than remain after the offset
                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
241
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Note that, unlike substr_replace(), $length defaults to 0, i.e. by
     * default $replacement is inserted at $start without removing anything.
     *
     * Negative values follow substr_replace(): a negative $start counts
     * from the end of the string, a negative $length gives the number of
     * characters from the end at which replacing stops.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      UTF-8 encoded string
     * @param  string $replacement text to put in
     * @param  int    $start       character offset where replacing starts
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // Negative arguments have to be turned into absolute, non-negative
        // ones first. Previously a negative $start dropped the head of the
        // string and a negative $length was added to $start as is.
        if($start < 0 || $length < 0){
            $strlen = utf8_strlen($string);
            if($start < 0) $start = max(0, $strlen + $start);
            if($start > $strlen) $start = $strlen;
            if($length < 0) $length = max(0, $strlen + $length - $start);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
257
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. With a charlist every
     * character in it is taken literally (no ".." ranges as in ltrim()).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip from the start
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // Quote the charlist for use in a character class. preg_quote() also
        // escapes '^', which the former hand written quoting missed: a
        // charlist starting with '^' negated (or broke) the class.
        $class = preg_quote($charlist, '/');

        return preg_replace('/^['.$class.']+/u','',$str);
    }
}
275
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. With a charlist every
     * character in it is taken literally (no ".." ranges as in rtrim()).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip from the end
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // Quote the charlist for use in a character class. preg_quote() also
        // escapes '^', which the former hand written quoting missed: a
        // charlist starting with '^' negated (or broke) the class.
        $class = preg_quote($charlist, '/');

        // D modifier: '$' must match at the very end only, not in front of a
        // trailing newline - otherwise "axx\n" would lose its "xx"
        return preg_replace('/['.$class.']+$/uD','',$str);
    }
}
293
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist the native trim() is used, otherwise the string
     * is passed through utf8_rtrim() and then utf8_ltrim().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip from both ends
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $tailStripped = utf8_rtrim($str,$charlist);
            return utf8_ltrim($tailStripped,$charlist);
        }

        return trim($str);
    }
}
308
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses the mb_string extension if available, otherwise the global
     * $UTF8_UPPER_TO_LOWER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            // no mbstring: translate via lookup table
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
326
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses the mb_string extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            // no mbstring: translate via lookup table
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
344
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and the remainder
        preg_match('/^(.{1})(.*)$/us', $str, $parts);
        $head = utf8_strtoupper($parts[1]);
        return $head.$parts[2];
    }
}
366
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The match consists of the leading whitespace (group 2, may be empty),
     * the first character of the word (group 3) and the rest of the word.
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $ucfirst   = utf8_strtoupper($matches[3]);

        // Cut the rest of the word by byte length of the parts in front of it.
        // The former ltrim() based approach went wrong because ltrim() strips
        // a different set of characters than the pattern's whitespace class
        // (it keeps form feeds and removes NUL bytes).
        $rest = substr($matches[0], strlen($leadingws) + strlen($matches[3]));

        return $leadingws . $ucfirst . $rest;
    }
}
403
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 lower case letters only, 1 upper case letters only,
     *                        0 (default) both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $wantLower = ($case <= 0);
        $wantUpper = ($case >= 0);

        if($wantLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($wantUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
425
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII strings are returned as they are, everything else is
     * translated through the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; // nothing to do
    }
}
439
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2
     * string, the control chars 0x00 to 0x19 are stripped as well.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS;
        global $UTF8_SPECIAL_CHARS2;

        // the quoted special chars are built once and reused on later calls
        static $quoted = null;
        if($quoted === null){
            $quoted = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $class = '['.$additional.'\x00-\x19'.$quoted.']';
        return preg_replace('/'.$class.'/u',$repl,$string);
    }
}
465
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched with the byte based strpos(); the byte position
     * found is converted to a character position with utf8_strlen(). As long
     * as that character position lies before $offset, the search is repeated
     * further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false character position of the match or false
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $byteShift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $byteShift);
            if ($bytePos === false) {
                return false;
            }

            // character position of the match
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            $tooEarly = ($charPos < $offset);
            if ($tooEarly) {
                // difference between byte and character position so far
                $byteShift = $bytePos - $charPos;
            }
        } while ($tooEarly);

        return $charPos;
    }
}
496
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as characters, those below 0x100
     * become decimal entities, all others hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $html = '';

        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $html .= '&#x'.dechex($codepoint).';';
                continue;
            }
            if ($codepoint >= 0x80) {
                $html .= "&#$codepoint;";
                continue;
            }
            $html .= chr($codepoint);
        }

        return $html;
    }
}
518
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any &#..; entity to a codepoint. By default (no $entities
     * flag) only numeric entities are decoded. Pass HTML_ENTITIES and named
     * entities, including &amp; &lt; etc., are handled in the same pass as
     * well. Doing both in one pass avoids the problem that would occur if
     * you had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
550
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback decoding a single numeric HTML entity to UTF-8
     *
     * @param  array $ent regex match: [2] is 'x' or 'X' for hexadecimal
     *                    entities, [3] holds the digits
     * @return string the UTF-8 encoded character
     * @see    utf8_unhtml
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');
        $cp    = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
565
if(!class_exists('utf8_entity_decoder')){
    /**
     * Decodes named and numeric HTML entities to UTF-8
     *
     * Used as preg_replace_callback() target by utf8_unhtml()
     */
    class utf8_entity_decoder {
        /** @var array maps an entity ('&eacute;') to its UTF-8 encoded character */
        public $table;

        /**
         * Build the entity => UTF-8 character table
         *
         * A real __construct() is needed: PHP 8 no longer treats a method
         * named like the class as constructor, which left $table unset.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES);
            $table = array_flip($table);
            $this->table = array_map(array($this,'makeutf8'), $table);
        }

        /**
         * Old style constructor name, kept for code calling it explicitly
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Make sure a character of the translation table is UTF-8
         *
         * Depending on the PHP version and configuration the table holds
         * single byte (e.g. ISO-8859-1) or UTF-8 encoded characters. ASCII
         * and multi byte values are UTF-8 already and kept untouched, only
         * single high bytes are converted using their byte value as code point.
         *
         * @param  string $c character from the translation table
         * @return string UTF-8 encoded character
         */
        function makeutf8($c) {
            if (strlen($c) != 1 || ord($c) < 0x80) return $c;
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decode one matched entity
         *
         * @param  array $ent regex match: [0] the whole entity, [1] is '#' for
         *                    numeric entities
         * @return string the decoded character, or the entity itself when unknown
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
588
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence (this includes non-shortest
     * forms, surrogates and code points above 0x10FFFF) and raises a PHP
     * error at level E_USER_WARNING.
     *
     * Without $strict no error is raised: octets that are not allowed at
     * their position are skipped and the value of an illegal but complete
     * sequence is still added to the output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the $str{$i} syntax is a parse error in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
762
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict such values are skipped.
     *
     * The string is built by plain concatenation. Earlier versions used
     * output buffering and left the buffer open when bailing out in strict
     * mode.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            # ASCII range (including control chars)
            if ( ($cp >= 0) && ($cp <= 0x007f) ) {

                $utf8 .= chr($cp);

            # 2 byte sequence (lower bound keeps negative values out)
            } else if ( ($cp >= 0x0080) && ($cp <= 0x07ff) ) {

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            # Byte order mark (skip)
            } else if ($cp == 0xFEFF) {

                // nop -- zap the BOM

            # Test for illegal surrogates
            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            # 3 byte sequence
            } else if ( ($cp >= 0x0800) && ($cp <= 0xffff) ) {

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            # 4 byte sequence
            } else if ( ($cp >= 0x10000) && ($cp <= 0x10ffff) ) {

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));

            } elseif($strict) {

                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return false;
            }
        }

        return $utf8;
    }
}
855
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
     * converts via utf8_to_unicode() and writes code points above 0xFFFF as
     * surrogate pairs (they used to be truncated to their lower 16 bits).
     *
     * @param  string $str UTF-8 encoded string (taken by reference, not modified)
     * @param  bool   $bom prepend the big endian byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-8' === 'UTF-16BE' ? 'UTF-8' : 'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // outside the BMP: split into high and low surrogate
                $cp  -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
873
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is read as a list of big endian 16 bit units. A high
     * surrogate followed by a low surrogate is combined into one code point
     * (such pairs used to be dropped); all other units are handed to
     * unicode_to_utf8() as they are.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if (!is_array($units)) return '';

        $uni  = array();
        $high = null;   // pending high surrogate waiting for its partner

        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // unpaired high surrogate: pass on unchanged
                $uni[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }

            $uni[] = $unit;
        }
        if ($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
885
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is walked once from left to right using an anchored match
     * at a moving offset, instead of cutting the matched part off the
     * string after every single character.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param  string $str     string to search
     * @param  string $replace string to replace bad bytes with (defaults to '',
     *                         i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        $str    = (string) $str;
        $len    = strlen($str);
        $pos    = 0;
        $result = '';

        // A modifier: the match has to start exactly at $pos
        while ($pos < $len && preg_match('/'.$UTF8_BAD.'/SA', $str, $matches, 0, $pos)) {
            if (isset($matches[2]) && $matches[2] !== '') {
                // second group took part: a byte fitting no valid sequence
                $result .= $replace;
            } else {
                $result .= $matches[0];
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
927
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte offset onto a UTF-8 character boundary
     *
     * Offsets at or below zero yield 0, offsets at or beyond the string
     * length yield the string length. Anything in between is shifted while
     * it points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     which way to shift the index,
     *                           false = backwards, to the start of the current character
     *                           true  = forwards, to the start of the next character
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i >= $limit) return $limit;

        // one loop for both directions: +1 walks forwards, -1 walks backwards
        $step = $next ? 1 : -1;
        while ($i != 0 && $i < $limit && (ord($str[$i]) & 0xC0) == 0x80) {
            $i += $step;
        }

        return $i;
    }
}
958
959// only needed if no mb_string available
960if(!UTF8_MBSTRING){
961  /**
962   * UTF-8 Case lookup table
963   *
   * This lookup table maps lower case letters (array keys) to their
   * corresponding upper case letters (array values) in UTF-8
966   *
967   * @author Andreas Gohr <andi@splitbrain.org>
968   */
969  global $UTF8_LOWER_TO_UPPER;
970  if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
971    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
972    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
973    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
974    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
975    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
976    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
977    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
978    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
979    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
980    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
981    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
982    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
983    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
984    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
985    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
986    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
987    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
988    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
989    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
990    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
991    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
992    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
993    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
994    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
995    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
996    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
997    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
998    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
999    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1000    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1001    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1002    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1003    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1004    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1005    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1006    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1007    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1008    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1009    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1010    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1011    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1012    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1013    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1014    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1015    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1016    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1017    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1018    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1019    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1020    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1021    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1022    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1023    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1024    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1025    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1026    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1027    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1028    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1029    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1030    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1031    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1032    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1033    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1034    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1035    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1036    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1037    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1038  );
1039
1040  /**
1041   * UTF-8 Case lookup table
1042   *
   * This lookup table maps upper case letters (array keys) to their
   * corresponding lower case letters (array values) in UTF-8
1045   *
1046   * @author Andreas Gohr <andi@splitbrain.org>
1047   */
1048  global $UTF8_UPPER_TO_LOWER;
1049  if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1050    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1051    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1052    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1053    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1054    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1055    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1056    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1057    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1058    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1059    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1060    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1061    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1062    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1063    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1064    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1065    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1066    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1067    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1068    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1069    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1070    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1071    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1072    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1073    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1074    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1075    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1076    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1077    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1078    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1079    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1080    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1081    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1082    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1083    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1084    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1085    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1086    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1087    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1088    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1089    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1090    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1091    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1092    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1093    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1094    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1095    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1096    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1097    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1098    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1099    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1100    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1101    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1102    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1103    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1104    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1105    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1106    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1107    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1108    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1109    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1110    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1111    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1112    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1113    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1114    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1115    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1116    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1117  );
1118}; // end of case lookup tables
1119
1120/**
1121 * UTF-8 lookup table for lower case accented letters
1122 *
 * This lookup table maps accented lower case characters to replacement
 * strings from the 7-bit ASCII range. It covers lower case letters only.
1125 *
1126 * @author Andreas Gohr <andi@splitbrain.org>
1127 * @see    utf8_deaccent()
1128 */
1129global $UTF8_LOWER_ACCENTS;
1130if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1131  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1132  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1133  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1134  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1135  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1136  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1137  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1138  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1139  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1140  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1141  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1142  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1143  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1144  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1145  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1146);
1147
1148/**
1149 * UTF-8 lookup table for upper case accented letters
1150 *
 * This lookup table maps accented upper case characters to replacement
 * strings from the 7-bit ASCII range. It covers upper case letters only.
1153 *
1154 * @author Andreas Gohr <andi@splitbrain.org>
1155 * @see    utf8_deaccent()
1156 */
1157global $UTF8_UPPER_ACCENTS;
1158if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1159  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1160  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1161  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1162  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1163  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1164  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1165  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1166  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1167  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1168  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1169  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1170  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1171  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1172  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1173  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1174);
1175
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * Every entry is a Unicode code point, not a UTF-8 byte sequence.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // Code points for the UTF-8 byte sequences E2 80 87, E2 80 AF, E2 81 A0 and
  // EF BB BF. These rows used to hold the raw byte values (0xe28087, ...),
  // which are not valid code points. C2 A0 is U+00A0, already listed above.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1257
1258// utf8 version of above data
1259global $UTF8_SPECIAL_CHARS2;
1260if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1261    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1262    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1263    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1264    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1265    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1266    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1267    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1268    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1269    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1270    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1271    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1272    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1273    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1274    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1275    '➷➸➹➺➻➼➽➾'.
1276    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1277    '�'.
1278    '�ﹼﹽ'.
1279    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1280    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1281    '����������������������������������������������������������������'.
1282    '   ⁠';
1283
1284/**
1285 * Romanization lookup table
1286 *
 * This lookup table provides a way to transform strings written in scripts
 * that are not based on Latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Language-specific peculiarities are not supported.
1293 *
1294 * @author Andreas Gohr <andi@splitbrain.org>
1295 * @author Vitaly Blokhin <vitinfo@vitn.com>
1296 * @link   http://www.uconv.com/translit.htm
1297 * @author Bisqwit <bisqwit@iki.fi>
1298 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1299 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1300 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1301 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1302 * @author Arthit Suriyawongkul <arthit@gmail.com>
1303 * @author Denis Scheither <amorphis@uni-bremen.de>
1304 */
1305global $UTF8_ROMANIZATION;
1306if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1307  // scandinavian - differs from what we do in deaccent
1308  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1309
1310  //russian cyrillic
1311  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1312  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1313  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1314  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1315  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1316  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1317  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1318  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1319  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1320  // Ukrainian cyrillic
1321  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1322  // Georgian
1323  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1324  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1325  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1326  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1327  'ჰ'=>'xh',
1328  //Sanskrit
1329  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1330  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1331  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1332  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1333  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1334  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1335  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1336  //Hebrew
1337  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1338  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1339  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1340  'ש'=>'sh','ת'=>'t',
1341  //Arabic
1342  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1343  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1344  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1345  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1346
1347  // Japanese characters  (last update: 2008-05-09)
1348
1349  // Japanese hiragana
1350
1351  // 3 character syllables, っ doubles the consonant after
1352  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1353  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1354  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1355  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1356  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1357  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1358  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1359  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1360  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1361  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1362  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1363
  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1365  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1366  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1367
1368   // 2 character syllables - normal
1369  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1370  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1371  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1372  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1373  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1374  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1375  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1376  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1377  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1378  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1379  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1380  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1381  'うぇ'=>'we','うぃ'=>'wi',
1382  'いぇ'=>'ye',
1383
1384  // 2 character syllables, っ doubles the consonant after
1385  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1386  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1387  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1388  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1389  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1390  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1391  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1392  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1393  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1394  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1395  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1396
1397  // 1 character syllables
1398  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1399  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1400  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1401  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1402  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1403  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1404  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1405  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1406  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1407  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1408  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1409  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1410  'わ'=>'wa','を'=>'wo',
1411  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1412  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1413  // old characters
1414  'ゑ'=>'we','ゐ'=>'wi',
1415
1416  //  convert what's left (probably only kicks in when something's missing above)
1417  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1418  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1419
1420  // never seen one of those (disabled for the moment)
1421  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1422  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1423  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1424  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1425  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1426  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1427  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1428  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1429  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1430  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1431  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1432  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1433  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1434  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1435
1436  // 'spare' characters from other romanization systems
1437  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1438  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1439  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1440  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1441  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1442  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1443  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1444  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1445  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1446  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1447
1448
1449  // Japanese katakana
1450
1451  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1452  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1453  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1454  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1455  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1456  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1457  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1458  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1459  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1460  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1461  'ッティー'=>'ttii',
1462  'ッヂィー'=>'ddii',
1463
1464  // 3 character syllables - doubled vowels
1465  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1466  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1467  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1468  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1469  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1470  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1471  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1472  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1473  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1474  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1475  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1476  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1477  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1478  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1479  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1480  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1481  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1482  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1483  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1484  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1485  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1486  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1487  'ウェー'=>'wee','ウィー'=>'wii',
1488  'イェー'=>'yee',
1489  'ティー'=>'tii',
1490  'ヂィー'=>'dii',
1491
1492  // 3 character syllables - doubled consonants
1493  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1494  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1495  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1496  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1497  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1498  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1499  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1500  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1501  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1502  'ッティ'=>'tti',
1503  'ッヂィ'=>'ddi',
1504
1505  // 3 character syllables - doubled vowel and consonants
1506  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1507  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1508  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1509  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1510  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1511  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1512  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1513  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1514  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1515  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1516  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1517
1518  // 2 character syllables - normal
1519  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1520  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1521  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1522  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1523  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1524  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1525  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1526  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1527  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1528  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1529  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1530  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1531  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1532  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1533  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1534  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1535  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1536  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1537  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1538  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1539  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1540  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1541  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1542  'ウェ'=>'we','ウィ'=>'wi',
1543  'イェ'=>'ye',
1544  'ティ'=>'ti',
1545  'ヂィ'=>'di',
1546
1547  // 2 character syllables - doubled vowel
1548  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1549  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1550  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1551  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1552  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1553  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1554  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1555  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1556  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1557  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1558  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1559  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1560  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1561  'ワー'=>'waa','ヲー'=>'woo',
1562  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1563  'ヵー'=>'kaa','ヶー'=>'kee',
1564  // old characters
1565  'ヱー'=>'wee','ヰー'=>'wii',
1566
1567  // separate katakana 'n'
1568  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1569  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1570
1571  // 2 character syllables - doubled consonants
1572  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1573  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1574  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1575  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1576  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1577  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1578  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1579  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1580  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1581  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1582  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1583
1584  // 1 character syllables
1585  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1586  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1587  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1588  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1589  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1590  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1591  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1592  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1593  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1594  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1595  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1596  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1597  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1598  'ワ'=>'wa','ヲ'=>'wo',
1599  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1600  'ヵ'=>'ka','ヶ'=>'ke',
1601  // old characters
1602  'ヱ'=>'we','ヰ'=>'wi',
1603
1604  //  convert what's left (probably only kicks in when something's missing above)
1605  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1606  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1607
1608  // special characters
1609  '・'=>'_','、'=>'_',
1610  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1611
1612  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1613  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1614  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1615  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1616  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1617  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1618  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1619  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1620  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1621  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1622  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1623  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1624
1625  // "Greeklish"
1626  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1627  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1628
1629  // Thai
1630  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1631  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1632  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1633  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1634  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1635  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1636  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1637  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1638  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1639  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1640  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1641  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1642  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1643  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1644  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1645  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1646	'๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1647  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1648
1649  // Korean
1650  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1651  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1652  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1653  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1654  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1655  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1656);
1657
1658//Setup VIM: ex: et ts=2 enc=utf-8 :
1659
1660