xref: /dokuwiki/inc/utf8.php (revision f4c788ee87e2a5de27fd70cd38490ef6d0cb7aa5)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Decide whether the mbstring extension is used by the UTF-8 helpers
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise 0. A value defined before this point is kept.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// make UTF-8 the internal encoding of mbstring whenever mbstring is in use
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str string to inspect
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        // byte-wise match (no 'u' modifier): look for any byte above 0x7F
        $hasHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);
        return !$hasHighByte;
    }
}
32
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is removed, so multi-byte
     * UTF-8 characters vanish completely. Returns a pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str input string
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        // no 'u' modifier on purpose: the pattern works on single bytes.
        // This replaces a per-byte loop that relied on the $str{$i} offset
        // syntax, which PHP 8 no longer parses.
        return preg_replace('/[\x80-\xFF]+/', '', (string)$str);
    }
}
52
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the structure is verified: each lead byte must be followed by the
     * number of 10bbbbbb continuation bytes it announces. Lead bytes of 5 and
     * 6 byte sequences are accepted, and neither overlong forms nor
     * surrogates are rejected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str string to test
     * @return bool   true if every byte fits the lead/continuation scheme
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {                 // 0bbbbbbb
                $pos++;
                continue;
            }

            if (($lead & 0xE0) == 0xC0) {       // 110bbbbb
                $follow = 1;
            } elseif (($lead & 0xF0) == 0xE0) { // 1110bbbb
                $follow = 2;
            } elseif (($lead & 0xF8) == 0xF0) { // 11110bbb
                $follow = 3;
            } elseif (($lead & 0xFC) == 0xF8) { // 111110bb
                $follow = 4;
            } elseif (($lead & 0xFE) == 0xFC) { // 1111110b
                $follow = 5;
            } else {
                return false;                   // does not match any model
            }

            // the announced number of continuation bytes has to follow
            for ($seen = 0; $seen < $follow; $seen++) {
                $pos++;
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
            }

            $pos++;
        }

        return true;
    }
}
80
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright.
     *
     * utf8_decode() is not guaranteed to exist (it belonged to the XML
     * extension before PHP 7.2 and is deprecated in newer releases), so the
     * function falls back to mb_strlen() and finally to counting every byte
     * that is not a UTF-8 continuation byte.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        if(function_exists('utf8_decode')){
            return strlen(utf8_decode($string));
        }

        if(defined('UTF8_MBSTRING') && UTF8_MBSTRING){
            return mb_strlen($string,'UTF-8');
        }

        // last resort: each character contributes exactly one byte that is
        // not of the form 10bbbbbb
        return (int)preg_match_all('/[^\x80-\xBF]/', (string)$string, $unused);
    }
}
97
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * With mbstring enabled the call is handed to mb_substr(). Otherwise the
     * part is cut out with a PCRE pattern using the 'u' flag:
     *
     *  - PCRE only supports repetitions of less than 65536, so large offsets
     *    and lengths are expressed as repeated groups of 65535 characters
     *  - where substr() may return false (e.g. offset > string length) this
     *    function returns an empty string
     *  - counting the characters of the string is relatively expensive, so it
     *    is only done for negative offsets or when a length is given
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string the requested part; '' when the PCRE fallback finds nothing to return
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        // normalise the parameter types once to avoid repeated notices/warnings
        $str    = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $charCount      = null;   // character count of $str, computed lazily
        $length_pattern = '';

        // turn a negative offset into the equivalent positive one
        // (a tail anchored pattern would work too, but is far slower)
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = $charCount + $offset;
            if ($offset < 0) $offset = 0;
        }

        // non-captured group that consumes exactly $offset characters
        if ($offset > 0) {
            $fullGroups = (int)($offset/65535);
            $remainder  = $offset%65535;

            $repeat = $fullGroups ? '(?:.{65535}){'.$fullGroups.'}' : '';
            $offset_pattern = '^(?:'.$repeat.'.{'.$remainder.'})';
        } else {
            $offset_pattern = '^';   // nothing to skip, just anchor
        }

        if ($length === null) {
            // no length: capture the rest of the string
            $length_pattern = '(.*)$';
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';

            if ($length > 0) {
                // never ask for more characters than remain after the offset
                $length = min($charCount-$offset, $length);

                $fullGroups = (int)($length/65535);
                $remainder  = $length%65535;

                // captured group of exactly $length characters
                $repeat = $fullGroups ? '(?:.{65535}){'.$fullGroups.'}' : '';
                $length_pattern = '('.$repeat.'.{'.$remainder.'})';

            } elseif ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $fullGroups = (int)((-$length)/65535);
                $remainder  = (-$length)%65535;

                // capture everything except the last -$length characters,
                // which are matched by a group anchored at the end
                $repeat = $fullGroups ? '(?:.{65535}){'.$fullGroups.'}' : '';
                $length_pattern = '(.*)(?:'.$repeat.'.{'.$remainder.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
201
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Replaces $length characters of $string, beginning at character
     * $start, with $replacement. Unlike substr_replace() the default length
     * is 0, i.e. the replacement is inserted at $start.
     *
     * Negative values follow substr_replace(): a negative $start counts from
     * the end of the string, a negative $length gives the number of
     * characters at the end of the string that are left untouched. Values
     * beyond the string are clamped to it.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      UTF-8 encoded input
     * @param  string $replacement text to put in place of the removed part
     * @param  int    $start       character offset where the replacement begins
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $start  = (int)$start;
        $length = (int)$length;
        $total  = utf8_strlen($string);

        // bring the start position into the range 0..$total
        if($start < 0){
            $start = max(0, $total + $start);
        }elseif($start > $total){
            $start = $total;
        }

        // bring the length into the range 0..($total - $start)
        if($length < 0){
            $length = max(0, $total - $start + $length);
        }elseif($length > $total - $start){
            $length = $total - $start;
        }

        $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $tail = ($start + $length < $total) ? utf8_substr($string, $start + $length) : '';

        return $head.$replacement.$tail;
    }
}
217
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is passed to ltrim(). Otherwise every
     * character of $charlist is taken literally (ranges like "a..z" are not
     * supported) and stripped from the beginning of $str.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; '^' has to be escaped
        // too, otherwise a leading '^' would negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
237
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is passed to rtrim(). Otherwise every
     * character of $charlist is taken literally (ranges like "a..z" are not
     * supported) and stripped from the end of $str.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; '^' has to be escaped
        // too, otherwise a leading '^' would negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
257
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist delegates to trim(); otherwise the characters of
     * $charlist are stripped from the end and then from the beginning.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $str = utf8_rtrim($str,$charlist);
            return utf8_ltrim($str,$charlist);
        }

        return trim($str);
    }
}
274
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses the mbstring extension if available, otherwise the global
     * $UTF8_UPPER_TO_LOWER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string lower cased string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
292
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses the mbstring extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string upper cased string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
310
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];

        return utf8_strtoupper($first).$rest;
    }
}
332
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * Words are separated by form feeds, horizontal tabs, vertical tabs,
     * linefeeds, carriage returns and spaces, which corresponds to the
     * definition of a "word" used by http://www.php.net/ucwords
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // group 2: separators in front of the word, group 3: first character of the word
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The word is cut out of the match by byte offsets instead of ltrim():
     * ltrim() does not strip form feeds (which the pattern treats as a
     * separator) but does strip NUL bytes (which the pattern treats as part
     * of a word), so it could duplicate or swallow a character.
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // byte offsets are safe here: both groups end on character boundaries
        $rest = (string)substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
369
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The optional parameter restricts the replacement to lower case
     * ($case = -1) or upper case ($case = 1) letters. By default ($case = 0)
     * both cases are handled. The replacements come from the global tables
     * $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 lower only, 1 upper only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower) $string = strtr($string,$UTF8_LOWER_ACCENTS);
        if($doUpper) $string = strtr($string,$UTF8_UPPER_ACCENTS);

        return $string;
    }
}
391
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched; anything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        global $UTF8_ROMANIZATION;

        if(!utf8_isASCII($string)){
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; // nothing to do
    }
}
405
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2 the
     * bytes 0x00 to 0x19 are stripped as well. The quoted list of specials
     * is computed on the first call and reused afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;
        static $specials = null;

        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional is inserted verbatim, so callers must pass it in a
        // form that is valid inside a character class
        $class = '['.$additional.'\x00-\x19'.$specials.']';

        return preg_replace('/'.$class.'/u',$repl,$string);
    }
}
431
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched with the byte based strpos(); the byte position
     * of a hit is then converted to a character position. Hits that lie in
     * front of the requested character offset are skipped by searching again
     * from a corrected byte offset.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack string to search in
     * @param  string  $needle   string to search for
     * @param  integer $offset   character offset to start searching from
     * @return integer|false character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $comp = 0; // difference between byte and character position of the last hit

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) return false;

            // character position = number of characters in front of the hit
            $length = utf8_strlen(substr($haystack, 0, $pos));

            if ($length < $offset) {
                $comp = $pos - $length;
            }
        } while ($length < $offset);

        return $length;
    }
}
462
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are kept, code points below 0x100 become decimal
     * entities and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $parts = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $parts[] = '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $parts[] = "&#$cp;";
            } else {
                $parts[] = chr($cp);
            }
        }

        return implode('', $parts);
    }
}
484
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint.
     * By default only numeric entities are decoded. Pass any non-null value
     * (e.g. HTML_ENTITIES) as second parameter and named entities, including
     * &amp; &lt; etc., are handled as well. Because all entities are decoded
     * in a single pass, the result of one replacement is never decoded again.
     *
     * The entity decoder and its translation table are only built when named
     * entities are actually requested.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        static $decoder = null;

        if (is_null($entities)) {
            return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                         'utf8_decode_numeric', $str);
        }

        if (is_null($decoder)) {
            $decoder = new utf8_entity_decoder();
        }

        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array($decoder, 'decode'), $str);
    }
}
516
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects a match array as created by the patterns in utf8_unhtml():
     * index 0 holds the complete entity, index 2 an optional 'x'/'X' marking
     * a hexadecimal entity and index 3 the digits.
     *
     * Entities whose digits are not valid for their notation (e.g. "&#foo;"
     * or "&#xZZ;") are returned unchanged instead of being decoded to a NUL
     * byte.
     *
     * @param  array $ent match array of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');

        if ($isHex) {
            if (!preg_match('/^[0-9A-Fa-f]+$/D', $ent[3])) return $ent[0];
            $cp = hexdec($ent[3]);
        } else {
            if (!preg_match('/^[0-9]+$/D', $ent[3])) return $ent[0];
            $cp = intval($ent[3]);
        }

        return unicode_to_utf8(array($cp));
    }
}
537
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        // maps a named entity (e.g. '&amp;') to its UTF-8 encoded character
        var $table;

        /**
         * Initializes the decoding tables
         *
         * From PHP 5.3.4 on the translation table is requested in UTF-8
         * directly. Relying on the default encoding is not safe: since
         * PHP 5.4 it is UTF-8, and applying ord() to a multi-byte character
         * would only look at its first byte. Older versions deliver single
         * byte ISO-8859-1 characters, which are converted with makeutf8().
         */
        function __construct() {
            if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
                $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
                $this->table = array_flip($table);
            } else {
                $table = array_flip(get_html_translation_table(HTML_ENTITIES));
                $this->table = array_map(array($this,'makeutf8'), $table);
            }
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Only meaningful for single byte (ISO-8859-1) characters, where the
         * byte value equals the Unicode code point.
         *
         * @param  string $c single byte character
         * @return mixed
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Numeric entities are handed to utf8_decode_numeric(), known named
         * entities are looked up in the table and everything else is
         * returned unchanged.
         *
         * @param  array $ent match array of an entity (index 0: whole entity, index 1: '#' for numeric ones)
         * @return string
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }

            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }

            return $ent[0];
        }
    }
}
581
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence (illegal lead byte, missing
     * continuation byte, non-shortest form, surrogate, code point beyond
     * 0x10FFFF) and raises a PHP error at level E_USER_WARNING. Without
     * $strict such input is not rejected: stray bytes are skipped and the
     * value of an illegal sequence is put into the output as it was decoded.
     *
     * Bytes are read with the $str[$i] offset syntax; the older $str{$i}
     * form is no longer accepted by PHP 8.
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            $in = ord($str[$i]);

            if ($mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } elseif (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4  = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } elseif (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4  = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } elseif (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4  = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } elseif (0xF8 == (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4  = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } elseif (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4  = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif ($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & $in)) {

                    // Legal continuation: merge its six payload bits
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /*
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        // initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif ($strict) {
                    /*
                     * ((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
755
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict such values are skipped.
     *
     * The result is built by plain string concatenation. No output buffer is
     * opened, so returning early in strict mode cannot leave a buffer behind.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ( ($cp >= 0) && ($cp <= 0x007f) ) {
                # ASCII range (including control chars)
                $utf8 .= chr($cp);

            } elseif ( ($cp < 0) || ($cp > 0x10ffff) ) {
                # out of range
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } elseif ($cp <= 0x07ff) {
                # 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } elseif ($cp == 0xFEFF) {
                # Byte order mark (skip)

            } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # illegal surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } elseif ($cp <= 0xffff) {
                # 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff)
                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
848
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is enabled. The fallback
     * decodes the string with utf8_to_unicode() and writes each code point as
     * one 16 bit unit, or as a surrogate pair for code points above 0xFFFF.
     *
     * @param  string $str UTF-8 encoded string (taken by reference, not modified)
     * @param  bool   $bom prepend the big endian byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF && $cp <= 0x10FFFF){
                // astral plane: split into high and low surrogate
                $cp  -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
866
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is split into 16 bit big endian units. A high surrogate
     * that is directly followed by a low surrogate is combined into one
     * code point; all other units are passed on as they are. The resulting
     * code points are encoded with unicode_to_utf8().
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return unicode_to_utf8($units);

        $uni  = array();
        $high = null;   // pending high surrogate waiting for its partner

        foreach($units as $unit){
            if($high !== null){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = null;
                    continue;
                }
                // unpaired high surrogate: hand it over unchanged
                $uni[] = $high;
                $high  = null;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        if($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
878
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is consumed from left to right: at every position either a
     * well-formed UTF-8 sequence is copied or a single bad byte is replaced.
     * Matching continues at a byte offset (\G anchor), so the string is not
     * copied again after each match.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        $str    = (string)$str;
        $len    = strlen($str);
        $pos    = 0;
        $result = '';

        while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
            // group 2 only takes part in the match for an invalid byte
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
920
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte offset onto the nearest UTF-8 character boundary
     *
     * Offsets at or below zero yield 0, offsets at or beyond the end of the
     * string yield the string's byte length. Any other offset is shifted for as
     * long as it points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction in which the index is shifted,
     *                           false = backwards (start of the current character)
     *                           true  = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        $bytes = strlen($str);

        if ($i <= 0) return 0;
        if ($i >= $bytes) return $bytes;

        // one loop serves both directions, only the step differs
        $step = $next ? 1 : -1;
        while ($i > 0 && $i < $bytes && (ord($str[$i]) & 0xC0) == 0x80) {
            $i += $step;
        }

        return $i;
    }
}
951
952// only needed if no mb_string available
953if(!UTF8_MBSTRING){
954    /**
955     * UTF-8 Case lookup table
956     *
957     * This lookup table maps lower case letters to their corresponding
958     * upper case letters in UTF-8
959     *
960     * @author Andreas Gohr <andi@splitbrain.org>
961     */
962    global $UTF8_LOWER_TO_UPPER;
963    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
964            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
965            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
966            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
967            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
968            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
969            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
970            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
971            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
972            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
973            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
974            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
975            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
976            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
977            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
978            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
979            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
980            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
981            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
982            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
983            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
984            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
985            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
986            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
987            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
988            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
989            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
990            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
991            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
992            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
993            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
994            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
995            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
996            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
997            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
998            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
999            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1000            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1001            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1002            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1003            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1004            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1005            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1006            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1007            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1008            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1009            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1010            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1011            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1012            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1013            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1014            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1015            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1016            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1017            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1018            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1019            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1020            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1021            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1022            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1023            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1024            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1025            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1026            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1027            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1028            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1029            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1030            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1031                );
1032
1033    /**
1034     * UTF-8 Case lookup table
1035     *
1036     * This lookup table maps upper case letters to their corresponding
1037     * lower case letters in UTF-8
1038     *
1039     * @author Andreas Gohr <andi@splitbrain.org>
1040     */
1041    global $UTF8_UPPER_TO_LOWER;
1042    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1043            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1044            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1045            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1046            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1047            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1048            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1049            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1050            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1051            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1052            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1053            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1054            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1055            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1056            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1057            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1058            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1059            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1060            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1061            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1062            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1063            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1064            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1065            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1066            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1067            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1068            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1069            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1070            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1071            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1072            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1073            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1074            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1075            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1076            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1077            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1078            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1079            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1080            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1081            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1082            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1083            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1084            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1085            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1086            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1087            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1088            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1089            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1090            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1091            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1092            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1093            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1094            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1095            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1096            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1097            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1098            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1099            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1100            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1101            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1102            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1103            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1104            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1105            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1106            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1107            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1108            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1109            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1110                );
1111}; // end of case lookup tables
1112
1113/**
1114 * UTF-8 lookup table for lower case accented letters
1115 *
1116 * This lookup table defines replacements from the ASCII-7 range for accented
1117 * characters. These are lower case letters only.
1118 *
1119 * @author Andreas Gohr <andi@splitbrain.org>
1120 * @see    utf8_deaccent()
1121 */
1122global $UTF8_LOWER_ACCENTS;
1123if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1124  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1125  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1126  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1127  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1128  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1129  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1130  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1131  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1132  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1133  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1134  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1135  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1136  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1137  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1138  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1139);
1140
1141/**
1142 * UTF-8 lookup table for upper case accented letters
1143 *
1144 * This lookup table defines replacements from the ASCII-7 range for accented
1145 * characters. These are upper case letters only.
1146 *
1147 * @author Andreas Gohr <andi@splitbrain.org>
1148 * @see    utf8_deaccent()
1149 */
1150global $UTF8_UPPER_ACCENTS;
1151if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1152  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1153  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1154  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1155  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1156  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1157  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1158  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1159  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1160  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1161  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1162  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1163  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1164  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1165  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1166  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1167);
1168
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point, not an encoded byte sequence.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // figure space, narrow no-break space, word joiner and BOM / zero width
  // no-break space, given as code points (no-break space 0x00a0 is listed above)
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1250
1251// utf8 version of above data
1252global $UTF8_SPECIAL_CHARS2;
1253if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1254    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1255    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1256    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1257    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1258    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1259    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1260    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1261    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1262    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1263    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1264    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1265    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1266    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1267    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1268    '➷➸➹➺➻➼➽➾'.
1269    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1270    '�'.
1271    '�ﹼﹽ'.
1272    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1273    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1274    '����������������������������������������������������������������'.
1275    '   ⁠';
1276
1277/**
1278 * Romanization lookup table
1279 *
1280 * This lookup table provides a way to transform strings written in a language
1281 * different from the ones based upon latin letters into plain ASCII.
1282 *
1283 * Please note: this is not a scientific transliteration table. It only works
1284 * one way, from non-latin to ASCII, and it works by simple character replacement
1285 * only. Specialities of each language are not supported.
1286 *
1287 * @author Andreas Gohr <andi@splitbrain.org>
1288 * @author Vitaly Blokhin <vitinfo@vitn.com>
1289 * @link   http://www.uconv.com/translit.htm
1290 * @author Bisqwit <bisqwit@iki.fi>
1291 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1292 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1293 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1294 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1295 * @author Arthit Suriyawongkul <arthit@gmail.com>
1296 * @author Denis Scheither <amorphis@uni-bremen.de>
1297 * @author Eivind Morland <eivind.morland@gmail.com>
1298 */
1299global $UTF8_ROMANIZATION;
1300if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1301  // scandinavian - differs from what we do in deaccent
1302  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1303
1304  //russian cyrillic
1305  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1306  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1307  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1308  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1309  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1310  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1311  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1312  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1313  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1314  // Ukrainian cyrillic
1315  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1316  // Georgian
1317  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1318  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1319  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1320  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1321  'ჰ'=>'xh',
1322  //Sanskrit
1323  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1324  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1325  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1326  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1327  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1328  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1329  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1330  //Sanskrit diacritics
1331  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1332  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1333  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1334  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1335  //Hebrew
1336  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1337  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1338  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1339  'ש'=>'sh','ת'=>'t',
1340  //Arabic
1341  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1342  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1343  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1344  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1345
1346  // Japanese characters  (last update: 2008-05-09)
1347
1348  // Japanese hiragana
1349
1350  // 3 character syllables, っ doubles the consonant after
1351  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1352  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1353  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1354  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1355  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1356  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1357  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1358  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1359  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1360  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1361  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1362
1363  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1364  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1365  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1366
1367   // 2 character syllables - normal
1368  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1369  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1370  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1371  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1372  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1373  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1374  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1375  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1376  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1377  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1378  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1379  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1380  'うぇ'=>'we','うぃ'=>'wi',
1381  'いぇ'=>'ye',
1382
1383  // 2 character syllables, っ doubles the consonant after
1384  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1385  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1386  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1387  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1388  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1389  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1390  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1391  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1392  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1393  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1394  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1395
1396  // 1 character syllables
1397  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1398  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1399  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1400  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1401  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1402  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1403  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1404  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1405  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1406  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1407  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1408  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1409  'わ'=>'wa','を'=>'wo',
1410  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1411  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1412  // old characters
1413  'ゑ'=>'we','ゐ'=>'wi',
1414
1415  //  convert what's left (probably only kicks in when something's missing above)
1416  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1417  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1418
1419  // never seen one of those (disabled for the moment)
1420  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1421  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1422  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1423  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1424  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1425  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1426  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1427  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1428  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1429  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1430  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1431  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1432  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1433  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1434
1435  // 'spare' characters from other romanization systems
1436  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1437  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1438  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1439  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1440  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1441  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1442  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1443  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1444  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1445  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1446
1447
1448  // Japanese katakana
1449
1450  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with a macron, but we don't want that in our URLs)
1451  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1452  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1453  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1454  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1455  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1456  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1457  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1458  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1459  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1460  'ッティー'=>'ttii',
1461  'ッヂィー'=>'ddii',
1462
1463  // 3 character syllables - doubled vowels
1464  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1465  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1466  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1467  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1468  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1469  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1470  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1471  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1472  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1473  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1474  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1475  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1476  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1477  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1478  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1479  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1480  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1481  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1482  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1483  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1484  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1485  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1486  'ウェー'=>'wee','ウィー'=>'wii',
1487  'イェー'=>'yee',
1488  'ティー'=>'tii',
1489  'ヂィー'=>'dii',
1490
1491  // 3 character syllables - doubled consonants
1492  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1493  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1494  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1495  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1496  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1497  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1498  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1499  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1500  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1501  'ッティ'=>'tti',
1502  'ッヂィ'=>'ddi',
1503
1504  // 3 character syllables - doubled vowel and consonants
1505  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1506  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1507  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1508  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1509  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1510  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1511  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1512  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1513  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1514  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1515  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1516
1517  // 2 character syllables - normal
1518  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1519  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1520  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1521  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1522  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1523  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1524  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1525  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1526  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1527  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1528  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1529  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1530  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1531  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1532  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1533  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1534  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1535  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1536  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1537  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1538  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1539  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1540  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1541  'ウェ'=>'we','ウィ'=>'wi',
1542  'イェ'=>'ye',
1543  'ティ'=>'ti',
1544  'ヂィ'=>'di',
1545
1546  // 2 character syllables - doubled vowel
1547  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1548  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1549  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1550  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1551  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1552  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1553  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1554  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1555  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1556  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1557  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1558  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1559  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1560  'ワー'=>'waa','ヲー'=>'woo',
1561  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1562  'ヵー'=>'kaa','ヶー'=>'kee',
1563  // old characters
1564  'ヱー'=>'wee','ヰー'=>'wii',
1565
1566  // separate katakana 'n'
1567  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1568  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1569
1570  // 2 character syllables - doubled consonants
1571  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1572  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1573  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1574  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1575  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1576  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1577  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1578  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1579  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1580  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1581  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1582
1583  // 1 character syllables
1584  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1585  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1586  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1587  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1588  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1589  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1590  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1591  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1592  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1593  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1594  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1595  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1596  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1597  'ワ'=>'wa','ヲ'=>'wo',
1598  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1599  'ヵ'=>'ka','ヶ'=>'ke',
1600  // old characters
1601  'ヱ'=>'we','ヰ'=>'wi',
1602
1603  //  convert what's left (probably only kicks in when something's missing above)
1604  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1605  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1606
1607  // special characters
1608  '・'=>'_','、'=>'_',
1609  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1610
1611  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1612  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1613  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1614  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1615  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1616  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1617  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1618  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1619  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1620  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1621  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1622  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1623
1624  // "Greeklish"
1625  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1626  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1627
1628  // Thai
1629  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1630  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1631  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1632  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1633  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1634  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1635  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1636  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1637  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1638  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1639  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1640  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1641  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1642  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1643  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1644  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1645  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1646  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1647
1648  // Korean
1649  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1650  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1651  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1652  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1653  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1654  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1655);
1656
1657
1658