xref: /dokuwiki/inc/utf8.php (revision 5a932e77b3c806514203323540cb30e5ab9c28cf)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Check for mb_string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, otherwise 0. A value defined before this point is kept as is.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// let the mbstring functions default to UTF-8 when they are in use
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_encodeFN')){
    /**
     * URL-encode a filename so it may contain Unicode characters
     *
     * Slashes are never encoded.
     *
     * With $safe enabled (the default) a string that consists only of
     * letters, digits and the characters / _ - . % is returned untouched.
     * This makes it safe to run the function several times on the same
     * string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urlencode
     * @param  string  $file filename to encode
     * @param  boolean $safe skip strings that look plain or already encoded
     * @return string
     */
    function utf8_encodeFN($file,$safe=true){
        $needsEncoding = !$safe || !preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);

        return $needsEncoding
            ? str_replace('%2F','/',urlencode($file))
            : $file;
    }
}
45
if(!function_exists('utf8_decodeFN')){
    /**
     * URL-decode a filename
     *
     * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urldecode
     * @param  string $file encoded filename
     * @return string
     */
    function utf8_decodeFN($file){
        return urldecode($file);
    }
}
60
if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <netzmeister@andreas-haerter.de>
     * @param  string $str
     * @return boolean true when no byte above 0x7F was found
     */
    function utf8_isASCII($str){
        $foundHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);
        return !$foundHighByte;
    }
}
71
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or above
     * is dropped, all other bytes are kept in order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return string
     */
    function utf8_strip($str){
        // no 'u' modifier on purpose: the pattern has to work on single bytes
        return (string) preg_replace('/[^\x00-\x7F]/', '', $str);
    }
}
91
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the structure is checked: every lead byte has to be followed by
     * the announced number of continuation bytes (10bbbbbb). Lead bytes
     * announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str
     * @return boolean
     */
    function utf8_check($Str) {
        $len = strlen($Str);
        $i   = 0;

        while ($i < $len) {
            $b = ord($Str[$i]);
            $i++;

            if ($b < 0x80) continue;                  # 0bbbbbbb

            // number of continuation bytes announced by the lead byte
            if     (($b & 0xE0) == 0xC0) $n = 1;      # 110bbbbb
            elseif (($b & 0xF0) == 0xE0) $n = 2;      # 1110bbbb
            elseif (($b & 0xF8) == 0xF0) $n = 3;      # 11110bbb
            elseif (($b & 0xFC) == 0xF8) $n = 4;      # 111110bb
            elseif (($b & 0xFE) == 0xFC) $n = 5;      # 1111110b
            else return false;                        # Does not match any model

            // the next $n bytes all have to exist and look like 10bbbbbb
            for ($end = $i + $n; $i < $end; $i++) {
                if ($i >= $len || (ord($Str[$i]) & 0xC0) != 0x80) {
                    return false;
                }
            }
        }
        return true;
    }
}
119
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() turns every character into exactly one byte (characters
     * outside ISO-8859-1 become '?'), so the byte length of its result is the
     * character count of the input.
     *
     * NOTE(review): utf8_decode() is flagged as deprecated by recent PHP
     * releases - confirm against the PHP versions that must be supported.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string
     * @return int number of characters
     */
    function utf8_strlen($string){
        $singleByte = utf8_decode($string);
        return strlen($singleByte);
    }
}
136
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * With mbstring enabled this is a plain mb_substr() call. Otherwise the
     * substring is cut out with a PCRE pattern in 'u' mode:
     *
     *  - pcre only supports repetition counts below 65536, so larger offsets
     *    and lengths are expressed as repeated groups of 65535 characters
     *  - like mb_substr (and unlike substr) an empty string is returned where
     *    substr would return false
     *  - counting the characters of the string is relatively expensive, so it
     *    is only done when needed (negative offset or a given length)
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string
     * @param integer number of UTF-8 characters offset (from left)
     * @param integer (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        // cast parameters once to avoid multiple notices/warnings further down
        $str       = (string)$str;      // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset    = (int)$offset;
        $hasLength = !is_null($length);
        if ($hasLength) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // character count of $str, filled in lazily
        $charCount = null;

        // turn a negative offset into the equivalent positive one
        // (a tail anchored pattern would work too, but is horribly slow)
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = max(0, $charCount + $offset);
        }

        // offset: skip a non-captured group of $offset characters
        if ($offset > 0) {
            $blocks = (int)($offset/65535);
            $rest   = $offset%65535;

            $offset_pattern = '^(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';
        } else {
            $offset_pattern = '^';      // nothing to skip, just anchor the pattern
        }

        // length: capture what has to be returned
        $length_pattern = '';
        if (!$hasLength) {
            $length_pattern = '(.*)$';  // the rest of the string
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';    // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($charCount-$offset, $length);

                $blocks = (int)($length/65535);
                $rest   = $length%65535;

                // a captured group of $length characters
                $length_pattern = '('.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';

            } else if ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $blocks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;

                // capture everything except a group of -$length characters
                // anchored at the tail-end of the string
                $length_pattern = '(.*)(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
240
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Offsets are counted in characters. Unlike substr_replace() the default
     * $length is 0, i.e. the replacement is inserted at $start without
     * removing anything.
     *
     * Negative values follow substr_replace(): a negative $start counts from
     * the end of the string, a negative $length gives the number of
     * characters at the end of the string that are left untouched.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      the input string
     * @param  string $replacement the text to put in
     * @param  int    $start       character offset where the replacement begins
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $start  = (int)$start;
        $length = (int)$length;

        // negative values are relative to the end of the string, so turn
        // them into absolute ones first (needs the character count)
        if($start < 0 || $length < 0){
            $total = utf8_strlen($string);

            if($start < 0) $start = max(0, $total + $start);
            if($start > $total) $start = $total;
            if($length < 0) $length = max(0, $total - $start + $length);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
256
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist this is plain ltrim(). With a charlist every
     * character in it is taken literally (no ranges) and stripped from the
     * beginning of the string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str      string to trim
     * @param  string $charlist characters to strip, may contain multibyte chars
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; '^' has to be quoted
        // as well, a leading one would negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
274
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist this is plain rtrim(). With a charlist every
     * character in it is taken literally (no ranges) and stripped from the
     * end of the string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str      string to trim
     * @param  string $charlist characters to strip, may contain multibyte chars
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; '^' has to be quoted
        // as well, a leading one would negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
292
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist this is plain trim(); with one the work is handed
     * to utf8_rtrim() and utf8_ltrim().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      string to trim
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $str = utf8_rtrim($str,$charlist);
            return utf8_ltrim($str,$charlist);
        }

        return trim($str);
    }
}
307
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * mapped through the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
325
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * mapped through the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
343
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        return utf8_strtoupper($matches[1]).$matches[2];
    }
}
365
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces.
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param array of matches corresponding to a single word
     *              [0] whole match, [2] leading whitespace, [3] first character
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // The match always starts with the leading whitespace directly
        // followed by the first character, so their byte lengths tell where
        // the remainder of the word begins. Trimming with ltrim() instead
        // would use a different whitespace set than the pattern does.
        $rest = (string) substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
402
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }
        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }
        return $string;
    }
}
424
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned unchanged, everything else is mapped
     * through the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string,$UTF8_ROMANIZATION);
        }
        return $string;
    }
}
438
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
     * string; in addition the control chars 0x00 to 0x19 are always stripped.
     * The quoted special chars are computed once and cached for later calls.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional is inserted as is, callers have to quote it themselves
        $class = '['.$additional.'\x00-\x19'.$specials.']';

        return preg_replace('/'.$class.'/u',$repl,$string);
    }
}
464
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched bytewise and the byte position found is turned
     * into a character position. As long as that character position lies
     * before $offset, the search is repeated from further right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching from
     * @return integer|false character position, false if the needle was not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // difference between byte and character position of the last hit,
        // added to $offset to get the byte position to continue from
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            // number of characters in front of the hit
            $length = utf8_strlen(substr($haystack, 0, $pos));

            $tooEarly = ($length < $offset);
            if ($tooEarly) {
                $comp = $pos - $length;
            }
        } while ($tooEarly);

        return $length;
    }
}
495
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Codepoints below 0x80 are kept as characters, codepoints below 0x100
     * become decimal entities and everything above becomes a hexadecimal
     * entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $ret = '';
        $codepoints = utf8_to_unicode($str);

        foreach ($codepoints as $cp) {
            if ($cp >= 0x100) {
                $ret .= '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $ret .= "&#$cp;";
            } else {
                $ret .= chr($cp);
            }
        }
        return $ret;
    }
}
517
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any &#..; entity to a codepoint.
     * By default ($entities is null) only numeric entities are decoded.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Doing both in a single pass avoids the problem
     * that would occur if you had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
549
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback turning a matched numeric entity into a UTF-8 character
     *
     * @param  array $ent regexp matches: [2] is 'x' or 'X' for hexadecimal
     *                    entities, [3] holds the digits
     * @return string UTF-8 encoded character
     * @see    utf8_unhtml
     * @see    unicode_to_utf8
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');
        $cp    = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
564
if(!class_exists('utf8_entity_decoder')){
    /**
     * Decodes named and numeric HTML entities to UTF-8
     *
     * Used as callback object by utf8_unhtml().
     */
    class utf8_entity_decoder {
        /** @var array maps an entity ('&auml;') to its UTF-8 character */
        var $table;

        /**
         * Build the entity lookup table
         *
         * The table is requested in UTF-8 explicitly, so its characters can
         * be used as they are no matter what default encoding the running
         * PHP version uses for get_html_translation_table().
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * PHP4 style constructor
         *
         * Kept so that code calling it by name still works; PHP versions
         * that no longer treat it as constructor use __construct() instead.
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Convert a single ISO-8859-1 byte to UTF-8
         *
         * No longer needed to build the table, kept for compatibility.
         *
         * @param  string $c single byte character
         * @return string UTF-8 encoded character
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback for preg_replace_callback() in utf8_unhtml()
         *
         * @param  array $ent regexp matches: [0] whole entity, [1] '#' for
         *                    numeric entities, [2] hex marker, [3] name or digits
         * @return string decoded character, unknown entities are returned as is
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
587
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING
     *
     * Without $strict, bytes that can not start or continue a sequence are
     * skipped and codepoints from illegal sequences are still put into the
     * output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  UTF-8 encoded string
     * @param  boolean Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: curly brace string offsets are gone in current PHP
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its 6 payload bits at the right position
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
761
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are silently skipped.
     *
     * Anything that is not an array results in an empty string.
     *
     * The result is built by plain string concatenation, so returning early
     * on an error leaves no output buffer behind.
     *
     * @param  array of unicode code points representing a string
     * @param  boolean Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach (array_keys($arr) as $k) {
            $cp = $arr[$k];

            if ( ($cp >= 0) && ($cp <= 0x007f) ) {
                # ASCII range (including control chars)

                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$arr[$k],
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp <= 0x10ffff) {
                # 4 byte sequence

                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));

            } elseif($strict) {

                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$arr[$k],
                    E_USER_WARNING
                    );

                // out of range
                return false;
            }
        }

        return $result;
    }
}
854
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_string if available. Otherwise the string is converted via
     * utf8_to_unicode(); codepoints above 0xFFFF are written as surrogate
     * pairs, values above 0x10FFFF can not be expressed in UTF-16 and are
     * written as U+FFFD (replacement character).
     *
     * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
     * @param  boolean $bom prepend the UTF-16BE byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0x10FFFF){
                // outside of what UTF-16 can hold
                $out .= pack('n',0xFFFD);
            }elseif($cp > 0xFFFF){
                // supplementary planes: split the 20 bits above 0x10000
                // into a high and a low surrogate
                $cp  -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
872
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is read as big endian 16 bit units. A high surrogate
     * directly followed by a low surrogate is combined into the codepoint
     * they encode; the resulting codepoints are handed to unicode_to_utf8().
     * Surrogates without a partner are handed over unchanged.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if (!is_array($units)) return '';

        $uni  = array();
        $high = 0;       // pending high surrogate, 0 if there is none

        foreach ($units as $unit) {
            if ($high) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    // complete pair
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = 0;
                    continue;
                }
                // no low surrogate followed, pass the lone one on
                $uni[] = $high;
                $high  = 0;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;      // wait for the second half
            } else {
                $uni[] = $unit;
            }
        }
        if ($high) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
884
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * The string is scanned from left to right. Every well formed UTF-8
     * sequence (this includes all ASCII bytes and control chars) is copied
     * unchanged, every other byte is substituted by $replace. With the
     * default empty replacement bad bytes are simply removed.
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str     string to search
     * @param string $replace string to replace bad bytes with (defaults to '',
     *                        which drops them) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        // A anchors each match at the current offset, so the subject never
        // has to be cut down with substr(); s lets the catch-all "." accept
        // any byte, so every position yields a match
        $pattern = '/'.$UTF8_BAD.'/ASs';

        $length = strlen($str);
        $offset = 0;
        $result = '';
        while ($offset < $length && preg_match($pattern, $str, $matches, 0, $offset)) {
            // group 2 only takes part in the match for an invalid byte
            if (isset($matches[2]) && $matches[2] !== '') {
                $result .= $replace;
            } else {
                $result .= $matches[0];
            }
            $offset += strlen($matches[0]);
        }
        return $result;
    }
}
926
if(!function_exists('utf8_correctIdx')){
    /**
     * Snap a byte offset into a UTF-8 string onto a character boundary
     *
     * Offsets at or before the start of the string yield 0, offsets at or
     * beyond its end yield the string's byte length. Any other offset that
     * points at a continuation byte (bit pattern 10xxxxxx) is moved until it
     * no longer does.
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to move in while on a continuation byte,
     *                           false = backwards (start of the current character)
     *                           true  = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        // clamp anything outside of the string to its edges
        if ($i <= 0) return 0;

        $byteCount = strlen($str);
        if ($i >= $byteCount) return $byteCount;

        if ($next) {
            // walk forwards, at most up to the end of the string
            for (; $i < $byteCount; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
            return $i;
        }

        // walk backwards, at most down to the first byte
        for (; $i; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
957
958// only needed if no mb_string available
959if(!UTF8_MBSTRING){
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps each lower case letter (array key) to its
     * corresponding upper case letter (array value) in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
968    global $UTF8_LOWER_TO_UPPER;
969    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
970            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
971            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
972            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
973            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
974            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
975            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
976            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
977            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
978            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
979            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
980            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
981            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
982            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
983            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
984            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
985            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
986            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
987            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
988            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
989            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
990            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
991            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
992            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
993            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
994            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
995            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
996            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
997            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
998            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
999            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1000            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1001            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1002            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1003            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1004            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1005            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1006            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1007            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1008            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1009            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1010            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1011            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1012            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1013            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1014            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1015            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1016            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1017            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1018            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1019            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1020            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1021            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1022            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1023            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1024            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1025            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1026            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1027            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1028            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1029            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1030            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1031            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1032            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1033            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1034            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1035            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1036            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1037                );
1038
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps each upper case letter (array key) to its
     * corresponding lower case letter (array value) in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
1047    global $UTF8_UPPER_TO_LOWER;
1048    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1049            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1050            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1051            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1052            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1053            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1054            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1055            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1056            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1057            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1058            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1059            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1060            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1061            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1062            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1063            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1064            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1065            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1066            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1067            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1068            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1069            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1070            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1071            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1072            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1073            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1074            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1075            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1076            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1077            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1078            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1079            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1080            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1081            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1082            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1083            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1084            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1085            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1086            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1087            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1088            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1089            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1090            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1091            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1092            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1093            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1094            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1095            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1096            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1097            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1098            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1099            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1100            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1101            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1102            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1103            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1104            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1105            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1106            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1107            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1108            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1109            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1110            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1111            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1112            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1113            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1114            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1115            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1116                );
1117}; // end of case lookup tables
1118
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
1128global $UTF8_LOWER_ACCENTS;
1129if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1130  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1131  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1132  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1133  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1134  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1135  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1136  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1137  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1138  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1139  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1140  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1141  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1142  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1143  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1144  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1145);
1146
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
1156global $UTF8_UPPER_ACCENTS;
1157if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1158  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1159  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1160  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1161  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1162  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1163  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1164  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1165  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1166  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1167  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1168  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1169  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1170  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1171  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1172  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1173);
1174
1175/**
1176 * UTF-8 array of common special characters
1177 *
1178 * This array should contain all special characters (not a letter or digit)
1179 * defined in the various local charsets - it's not a complete list of non-alphanum
1180 * characters in UTF-8. It's not perfect but should match most cases of special
1181 * chars.
1182 *
1183 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1184 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1185 *
1186 * @author Andreas Gohr <andi@splitbrain.org>
1187 * @see    utf8_stripspecials()
1188 */
1189global $UTF8_SPECIAL_CHARS;
1190if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1191  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1192  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1193          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1194  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1195  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1196  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1197  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1198  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1199  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1200  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1201  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1202  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1203  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1204  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1205  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1206  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1207  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1208  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1209  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1210  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1211  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1212  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1213  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1214  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1215  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1216  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1217  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1218  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1219  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1220  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1221  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1222  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1223  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1224  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1225  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1226  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1227  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1228  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1229  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1230  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1231  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1232  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1233  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1234  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1235  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1236  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1237  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1238  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1239  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1240  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1241  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1242  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1243  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1244          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1245  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1246  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1247  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1248  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1249  0xffeb, 0xffec, 0xffed, 0xffee,
1250  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1251  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1252  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
1253  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
1254  0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1255);
1256
1257// utf8 version of above data
1258global $UTF8_SPECIAL_CHARS2;
1259if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1260    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1261    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1262    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1263    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1264    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1265    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1266    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1267    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1268    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1269    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1270    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1271    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1272    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1273    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1274    '➷➸➹➺➻➼➽➾'.
1275    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1276    '�'.
1277    '�ﹼﹽ'.
1278    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1279    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1280    '����������������������������������������������������������������'.
1281    '   ⁠';
1282
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon Latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 * @author Arthit Suriyawongkul <arthit@gmail.com>
 * @author Denis Scheither <amorphis@uni-bremen.de>
 */
1304global $UTF8_ROMANIZATION;
1305if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1306  // scandinavian - differs from what we do in deaccent
1307  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1308
1309  //russian cyrillic
1310  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1311  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1312  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1313  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1314  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1315  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1316  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1317  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1318  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1319  // Ukrainian cyrillic
1320  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1321  // Georgian
1322  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1323  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1324  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1325  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1326  'ჰ'=>'xh',
1327  //Sanskrit
1328  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1329  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1330  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1331  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1332  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1333  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1334  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1335  //Hebrew
1336  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1337  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1338  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1339  'ש'=>'sh','ת'=>'t',
1340  //Arabic
1341  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1342  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1343  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1344  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1345
1346  // Japanese characters  (last update: 2008-05-09)
1347
1348  // Japanese hiragana
1349
1350  // 3 character syllables, っ doubles the consonant after
1351  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1352  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1353  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1354  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1355  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1356  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1357  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1358  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1359  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1360  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1361  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1362
1363  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1364  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1365  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1366
1367   // 2 character syllables - normal
1368  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1369  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1370  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1371  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1372  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1373  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1374  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1375  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1376  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1377  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1378  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1379  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1380  'うぇ'=>'we','うぃ'=>'wi',
1381  'いぇ'=>'ye',
1382
1383  // 2 character syllables, っ doubles the consonant after
1384  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1385  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1386  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1387  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1388  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1389  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1390  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1391  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1392  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1393  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1394  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1395
1396  // 1 character syllables
1397  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1398  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1399  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1400  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1401  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1402  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1403  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1404  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1405  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1406  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1407  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1408  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1409  'わ'=>'wa','を'=>'wo',
1410  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1411  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1412  // old characters
1413  'ゑ'=>'we','ゐ'=>'wi',
1414
1415  //  convert what's left (probably only kicks in when something's missing above)
1416  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1417  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1418
1419  // never seen one of those (disabled for the moment)
1420  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1421  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1422  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1423  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1424  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1425  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1426  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1427  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1428  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1429  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1430  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1431  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1432  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1433  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1434
1435  // 'spare' characters from other romanization systems
1436  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1437  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1438  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1439  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1440  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1441  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1442  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1443  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1444  //'し'=>'ci','い'=>'yi','ぢ'=>'dzi',
1445  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1446
1447
1448  // Japanese katakana
1449
1450  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1451  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1452  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1453  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1454  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1455  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1456  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1457  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1458  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1459  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1460  'ッティー'=>'ttii',
1461  'ッヂィー'=>'ddii',
1462
1463  // 3 character syllables - doubled vowels
1464  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1465  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1466  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1467  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1468  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1469  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1470  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1471  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1472  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1473  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1474  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1475  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1476  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1477  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1478  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1479  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1480  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1481  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1482  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1483  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1484  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1485  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1486  'ウェー'=>'wee','ウィー'=>'wii',
1487  'イェー'=>'yee',
1488  'ティー'=>'tii',
1489  'ヂィー'=>'dii',
1490
1491  // 3 character syllables - doubled consonants
1492  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1493  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1494  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1495  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1496  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1497  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1498  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1499  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1500  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1501  'ッティ'=>'tti',
1502  'ッヂィ'=>'ddi',
1503
1504  // 3 character syllables - doubled vowel and consonants
1505  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1506  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1507  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1508  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1509  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1510  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1511  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1512  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1513  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1514  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1515  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1516
1517  // 2 character syllables - normal
1518  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1519  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1520  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1521  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1522  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1523  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1524  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1525  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1526  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1527  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1528  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1529  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1530  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1531  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1532  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1533  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1534  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1535  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1536  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1537  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1538  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1539  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1540  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1541  'ウェ'=>'we','ウィ'=>'wi',
1542  'イェ'=>'ye',
1543  'ティ'=>'ti',
1544  'ヂィ'=>'di',
1545
1546  // 2 character syllables - doubled vowel
1547  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1548  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1549  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1550  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1551  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1552  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1553  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1554  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1555  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1556  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1557  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1558  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1559  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1560  'ワー'=>'waa','ヲー'=>'woo',
1561  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1562  'ヵー'=>'kaa','ヶー'=>'kee',
1563  // old characters
1564  'ヱー'=>'wee','ヰー'=>'wii',
1565
1566  // separate katakana 'n'
1567  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1568  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1569
1570  // 2 character syllables - doubled consonants
1571  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1572  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1573  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1574  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1575  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1576  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1577  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1578  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1579  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1580  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1581  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1582
1583  // 1 character syllables
1584  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1585  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1586  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1587  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1588  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1589  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1590  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1591  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1592  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1593  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1594  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1595  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1596  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1597  'ワ'=>'wa','ヲ'=>'wo',
1598  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1599  'ヵ'=>'ka','ヶ'=>'ke',
1600  // old characters
1601  'ヱ'=>'we','ヰ'=>'wi',
1602
1603  //  convert what's left (probably only kicks in when something's missing above)
1604  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1605  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1606
1607  // special characters
1608  '・'=>'_','、'=>'_',
1609  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1610
1611  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1612  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1613  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1614  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1615  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1616  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1617  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1618  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1619  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1620  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1621  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1622  //'シ'=>'ci','フ'=>'hu','シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1623
1624  // "Greeklish"
1625  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1626  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1627
1628  // Thai
1629  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1630  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1631  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1632  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1633  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1634  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1635  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1636  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1637  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1638  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1639  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1640  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1641  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1642  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1643  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1644  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1645  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1646  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1647
1648  // Korean
1649  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1650  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1651  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1652  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1653  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1654  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1655);
1656
1657
1658