xref: /dokuwiki/inc/utf8.php (revision a8c343f2838765f8034ee6b91263b1b417f3a7ec)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect multibyte string support
 *
 * UTF8_MBSTRING is set to 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise to 0. A value defined earlier is left alone.
 */
if(!defined('UTF8_MBSTRING')){
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

// make the mb_* functions operate on UTF-8 by default
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str the string to inspect
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        // preg_match() yields 1 only when a high byte was found
        $found = preg_match('/(?:[^\x00-\x7F])/', $str);
        return ($found !== 1);
    }
}
32
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or above
     * is dropped, all other bytes are kept in their original order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to clean
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax is a parse error since PHP 8.0
            if(ord($str[$i]) < 128){
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}
52
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is inspected: every lead byte has to be followed
     * by the matching number of 10bbbbbb continuation bytes. Lead bytes for
     * 5 and 6 byte sequences are accepted, and the decoded values are not
     * checked any further.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the string to check
     * @return bool   true when all sequences are structurally complete
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead >= 0x80) {
                // number of continuation bytes announced by the lead byte
                if (($lead & 0xE0) == 0xC0) {
                    $trail = 1; # 110bbbbb
                } elseif (($lead & 0xF0) == 0xE0) {
                    $trail = 2; # 1110bbbb
                } elseif (($lead & 0xF8) == 0xF0) {
                    $trail = 3; # 11110bbb
                } elseif (($lead & 0xFC) == 0xF8) {
                    $trail = 4; # 111110bb
                } elseif (($lead & 0xFE) == 0xFC) {
                    $trail = 5; # 1111110b
                } else {
                    return false; # Does not match any model
                }

                // each announced byte has to exist and look like 10bbbbbb
                while ($trail > 0) {
                    $pos++;
                    if ($pos == $total) return false;
                    if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
                    $trail--;
                }
            }

            $pos++;
        }

        return true;
    }
}
80
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation. Both '/' and '\'
     * are treated as directory separators; everything after the last one is
     * returned.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $slashrpos = strrpos($path, '/');
        $bslashrpos = strrpos($path, '\\');
        $rpos = max($slashrpos === false ? -1 : $slashrpos, $bslashrpos === false ? -1 : $bslashrpos);
        $path = substr($path, $rpos+1);

        $suflen = strlen($suffix);
        // strict comparison: with == numeric looking strings such as '100' and
        // '1e2' compare as equal and the wrong "suffix" would be cut off
        if($suflen && (substr($path, -$suflen) === (string) $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
107
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Returns the number of characters instead of the number of bytes.
     * mb_strlen() is used when the mbstring extension is enabled. Without it
     * all bytes that are not UTF-8 continuation bytes (10xxxxxx) are counted;
     * for well-formed UTF-8 that is exactly one per character.
     *
     * The former strlen(utf8_decode()) trick is no longer used because
     * utf8_decode() is deprecated as of PHP 8.2.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        if(UTF8_MBSTRING) return mb_strlen($string, 'UTF-8');

        // drop the continuation bytes, what remains is one byte per character
        return strlen(preg_replace('/[\x80-\xBF]/', '', (string) $string));
    }
}
124
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * Uses mb_substr() when mbstring is enabled, otherwise a PCRE based
     * fallback. Negative offsets and lengths count from the end of the string.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string the requested part, an empty string if nothing matches
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         *
         * the character count is taken by removing all UTF-8 continuation bytes (10xxxxxx) and
         * measuring what is left; utf8_decode() is deprecated as of PHP 8.2 and no longer used here
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = strlen(preg_replace('/[\x80-\xBF]/', '', $str));        // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if (!isset($strlen)) $strlen = strlen(preg_replace('/[\x80-\xBF]/', '', $str));    // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                $length = min($strlen-$offset, $length);  // reduce any length that would go past the end of the string

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                // always executed, only the line above depends on $Lx
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
228
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * All positions are counted in characters. Unlike substr_replace() the
     * default $length is 0, i.e. the replacement is inserted at $start without
     * removing anything.
     *
     * A negative $start counts from the end of the string, a negative $length
     * gives the number of characters at the end of the string that are kept.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      the UTF-8 input string
     * @param  string $replacement the string to put in
     * @param  int    $start       character position where the replacement begins
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $start  = (int) $start;
        $length = (int) $length;
        $strlen = utf8_strlen($string);

        // translate the start into a position within 0..$strlen
        if($start < 0){
            $start = max(0, $strlen + $start);
        }elseif($start > $strlen){
            $start = $strlen;
        }

        // first character after the replaced part, never before $start
        if($length < 0){
            $end = max($start, $strlen + $length);
        }else{
            $end = min($strlen, $start + $length);
        }

        $ret = '';
        if($start > 0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $end);
        return $ret;
    }
}
244
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is handed to ltrim(). With a charlist every
     * character in it is taken literally; ranges like 'a..z' are not
     * supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a characterclass; preg_quote() also covers
        // '^' which would otherwise negate the class when it comes first
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
264
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is handed to rtrim(). With a charlist every
     * character in it is taken literally; ranges like 'a..z' are not
     * supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a characterclass; preg_quote() also covers
        // '^' which would otherwise negate the class when it comes first
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
284
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist is handled by trim(); otherwise the right side is
     * trimmed with utf8_rtrim() first and the left side with utf8_ltrim().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist == ''){
            return trim($str);
        }

        $rightTrimmed = utf8_rtrim($str, $charlist);
        return utf8_ltrim($rightTrimmed, $charlist);
    }
}
301
if(!function_exists('utf8_strtolower')){
    /**
     * Unicode aware replacement for strtolower()
     *
     * With mbstring enabled the work is done by mb_strtolower(); without it
     * the global $UTF8_UPPER_TO_LOWER table is applied through strtr().
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
319
if(!function_exists('utf8_strtoupper')){
    /**
     * Unicode aware replacement for strtoupper()
     *
     * With mbstring enabled the work is done by mb_strtoupper(); without it
     * the global $UTF8_LOWER_TO_UPPER table is applied through strtr().
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
337
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Turns the first character of a string into upper case
     *
     * @author Harry Fuecks
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0){
            return '';
        }
        if($chars == 1){
            return utf8_strtoupper($str);
        }

        // split into the first character and the remainder
        preg_match('/^(.{1})(.*)$/us', $str, $parts);
        $first = utf8_strtoupper($parts[1]);
        return $first.$parts[2];
    }
}
359
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Turns the first character of every word in a string into upper case
     *
     * @author Harry Fuecks
     * @param string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // A word is a run of characters other than form feed (\x0c), horizontal
        // tab (\x09), vertical tab (\x0b), linefeed (\x0a), carriage return
        // (\x0d) and space (\x20), following the definition of a "word" at
        // http://www.php.net/ucwords
        // Group 2 holds the whitespace in front of the word, group 3 its first
        // character.
        $wordPattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($wordPattern, 'utf8_ucwords_callback', $str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * There is no need to call this directly
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $whitespace = $matches[2];
        $upperInitial = utf8_strtoupper($matches[3]);

        // swap the first character of the bare word for its upper case form
        $word = ltrim($matches[0]);
        $word = utf8_substr_replace($word, $upperInitial, 0, 1);

        return $whitespace . $word;
    }
}
396
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The optional parameter restricts the replacement to lower case letters
     * ($case = -1) or upper case letters ($case = 1). By default ($case = 0)
     * both are replaced. The replacements come from the global tables
     * $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @param  int    $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
418
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is run through
     * the global $UTF8_ROMANIZATION replacement table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        //nothing to do
        return $string;
    }
}
432
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters listed in the global $UTF8_SPECIAL_CHARS2 are replaced,
     * together with the bytes 0x00 to 0x19 which are not part of that list.
     * The quoted list is built on the first call and reused afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class as given
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern, $repl, $string);
    }
}
458
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The byte position reported by strpos() is converted into a character
     * position. A match in front of the requested character offset makes the
     * search start again further into the string.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   offset in characters
     * @return integer|false character position of the match, false if there is none
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // surplus of bytes over characters in front of the last match
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            // character position belonging to the byte position
            $length = utf8_strlen(substr($haystack, 0, $pos));

            $tooEarly = ($length < $offset);
            if ($tooEarly) {
                $comp = $pos - $length;
            }
        } while ($tooEarly);

        return $length;
    }
}
489
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are kept, code points below 0x100 become decimal
     * entities and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $parts = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp < 0x80) {
                $parts[] = chr($cp);
                continue;
            }
            if ($cp < 0x100) {
                $parts[] = "&#$cp;";
                continue;
            }
            $parts[] = '&#x'.dechex($cp).';';
        }

        return implode('', $parts);
    }
}
511
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any &#..; entity to a codepoint. When $entities is left at
     * null only numeric entities are decoded. With any other value (e.g.
     * HTML_ENTITIES) named entities, including &amp; &lt; etc. are handled as
     * well. Decoding everything in one pass avoids the problem that would
     * occur if you had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder is created once and shared by all later calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
543
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array of the patterns used in utf8_unhtml(): index 2
     * holds the optional 'x'/'X' marker of a hexadecimal entity, index 3 the
     * digits.
     *
     * @param $ent array match of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $marker = $ent[2];
        $isHex = ($marker == 'X' || $marker == 'x');

        if ($isHex) {
            $cp = hexdec($ent[3]);
        } else {
            $cp = intval($ent[3]);
        }

        return unicode_to_utf8(array($cp));
    }
}
564
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps a named entity (e.g. '&auml;') to its UTF-8 character */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly. Relying on
         * the default encoding and converting each character with ord() only
         * works for single byte tables; on PHP versions where the default is
         * UTF-8 it produced wrong characters for all non-ASCII entities.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single byte (ISO-8859-1) character to UTF-8. No longer
         * used by the constructor, kept for callers that rely on it.
         *
         * @param $c string
         * @return mixed
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Expects the match array of the second pattern in utf8_unhtml():
         * index 0 is the complete entity, index 1 is '#' for numeric entities.
         * Unknown named entities are returned unchanged.
         *
         * @param $ent array match of an entity
         * @return string
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
608
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING
     *
     * Without $strict no error is raised: an octet that cannot start or
     * continue a sequence is skipped, and a completed sequence is added to
     * the output even when it fails the checks for illegal code points.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax is a parse error since PHP 8.0
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
782
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING. Without
     * $strict such values are skipped silently.
     *
     * The result is built by plain string concatenation. Output buffering is
     * not used: a strict mode error would return while the buffer is still
     * open and swallow all later output of the script.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points;
     *               an empty string when $arr is not an array
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            if ( ($cp >= 0) && ($cp <= 0x007f) ) {
                # ASCII range (including control chars)

                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp <= 0x10ffff) {
                # 4 byte sequence

                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));

            } elseif($strict) {

                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return false;
            }
        }

        return $result;
    }
}
875
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is enabled. Otherwise the code
     * points returned by utf8_to_unicode() are packed as 16 bit big endian
     * units; code points from U+10000 to U+10FFFF are written as a surrogate
     * pair instead of being cut down to their lower 16 bits.
     *
     * @param  string $str UTF-8 encoded string (passed by reference, not modified)
     * @param  bool   $bom prepend the UTF-16BE byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF && $cp <= 0x10FFFF){
                // split into high and low surrogate, 10 bits each
                $cp -= 0x10000;
                $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
893
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big-endian 16-bit units. A high surrogate
     * (D800-DBFF) directly followed by a low surrogate (DC00-DFFF) is merged
     * into the supplementary code point the pair encodes; all other units are
     * passed on unchanged. The code point list is then encoded by
     * unicode_to_utf8().
     *
     * @param string $str  UTF-16BE encoded string (passed by reference, not modified here)
     * @return mixed       the result of unicode_to_utf8() for the decoded code points
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        // unpack() numbers its result from 1; reindex for simple look-ahead
        $units = is_array($units) ? array_values($units) : array();
        $count = count($units);

        $uni = array();
        for($i = 0; $i < $count; $i++){
            $unit = $units[$i];
            $isPair = $unit >= 0xD800 && $unit <= 0xDBFF
                && $i + 1 < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF;

            if($isPair){
                $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed as well
            }else{
                $uni[] = $unit;
            }
        }
        return unicode_to_utf8($uni);
    }
}
905
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * The string is consumed from left to right with a PCRE pattern that
     * matches either one well-formed UTF-8 sequence or, if none fits, a single
     * byte. Well-formed sequences are copied to the result unchanged; a byte
     * that only the catch-all alternative could match is replaced by $replace.
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $sequence =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        // \G anchors every match at the current offset, so the subject can be
        // walked in place instead of being re-sliced after each match
        $regex  = '/\G'.$sequence.'/S';
        $length = strlen($str);
        $offset = 0;
        $result = '';

        while ($offset < $length && preg_match($regex, $str, $matches, 0, $offset)) {
            // group 2 only takes part in the match for the catch-all byte
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $offset += strlen($matches[0]);
        }
        return $result;
    }
}
947
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte offset onto the start of a UTF-8 character
     *
     * Offsets at or below zero yield 0, offsets at or beyond the end of the
     * string yield the string length. Any other offset is shifted for as long
     * as it points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = towards the start (current character)
     *                           true = towards the end (next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i>=$limit) return $limit;

        if ($next) {
            // walk forward past continuation bytes
            for (; $i<$limit; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
            return $i;
        }

        // walk backward to the lead byte of the current character
        for (; $i; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
978
979// only needed if no mb_string available
980if(!UTF8_MBSTRING){
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps lower case letters to their corresponding
     * upper case letters in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
989    global $UTF8_LOWER_TO_UPPER;
990    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
991            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
992            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
993            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
994            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
995            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
996            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
997            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
998            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
999            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1000            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1001            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1002            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1003            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1004            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1005            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1006            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1007            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1008            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1009            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1010            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1011            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1012            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1013            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1014            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1015            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1016            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1017            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1018            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1019            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1020            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1021            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1022            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1023            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1024            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1025            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1026            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1027            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1028            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1029            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1030            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1031            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1032            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1033            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1034            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1035            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1036            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1037            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1038            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1039            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1040            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1041            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1042            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1043            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1044            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1045            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1046            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1047            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1048            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1049            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1050            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1051            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1052            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1053            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1054            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1055            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1056            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1057            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1058                );
1059
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps upper case letters to their corresponding
     * lower case letters in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
1068    global $UTF8_UPPER_TO_LOWER;
1069    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1070            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1071            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1072            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1073            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1074            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1075            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1076            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1077            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1078            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1079            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1080            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1081            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1082            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1083            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1084            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1085            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1086            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1087            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1088            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1089            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1090            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1091            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1092            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1093            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1094            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1095            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1096            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1097            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1098            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1099            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1100            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1101            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1102            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1103            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1104            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1105            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1106            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1107            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1108            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1109            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1110            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1111            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1112            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1113            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1114            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1115            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1116            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1117            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1118            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1119            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1120            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1121            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1122            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1123            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1124            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1125            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1126            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1127            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1128            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1129            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1130            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1131            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1132            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1133            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1134            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1135            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1136            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1137                );
1138}; // end of case lookup tables
1139
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
1149global $UTF8_LOWER_ACCENTS;
1150if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1151  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1152  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1153  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1154  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1155  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1156  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1157  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1158  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1159  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1160  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1161  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1162  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1163  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1164  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1165  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1166);
1167
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
1177global $UTF8_UPPER_ACCENTS;
1178if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1179  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1180  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1181  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1182  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1183  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1184  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1185  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1186  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1187  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1188  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1189  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1190  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1191  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1192  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1193  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1194);
1195
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point (not a UTF-8 byte sequence).
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // These were previously listed as packed UTF-8 bytes (0xe28087, 0xe280af,
  // 0xe281a0, 0xefbbbf, plus 0xc2a0 for U+00A0 which is already listed above);
  // as code points those values were out of range or named a Hangul syllable.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1277
// the special characters from $UTF8_SPECIAL_CHARS above, as one UTF-8 encoded string
1279global $UTF8_SPECIAL_CHARS2;
1280if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1281    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1282    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1283    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1284    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1285    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1286    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1287    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1288    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1289    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1290    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1291    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1292    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1293    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1294    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1295    '➷➸➹➺➻➼➽➾'.
1296    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1297    '�'.
1298    '�ﹼﹽ'.
1299    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1300    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1301    '����������������������������������������������������������������'.
1302    '   ⁠';
1303
1304/**
1305 * Romanization lookup table
1306 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
1313 *
1314 * @author Andreas Gohr <andi@splitbrain.org>
1315 * @author Vitaly Blokhin <vitinfo@vitn.com>
1316 * @link   http://www.uconv.com/translit.htm
1317 * @author Bisqwit <bisqwit@iki.fi>
1318 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1319 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1320 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1321 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1322 * @author Arthit Suriyawongkul <arthit@gmail.com>
1323 * @author Denis Scheither <amorphis@uni-bremen.de>
1324 * @author Eivind Morland <eivind.morland@gmail.com>
1325 */
// Romanization lookup table: every key is one or more UTF-8 characters and
// every value is the ASCII text that replaces it. The table is only built when
// $UTF8_ROMANIZATION is still empty, so an already populated table is kept.
//
// NOTE: when an array literal repeats a key, PHP keeps the LAST value, so a
// later row silently overrides an earlier one.
// NOTE(review): presumably applied with strtr(), which prefers the longest
// matching key regardless of row order -- confirm against the caller.
global $UTF8_ROMANIZATION;
if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
  // scandinavian - differs from what we do in deaccent
  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',

  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian ('კ' is the k sound; 'პ' is the p sound)
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Sanskrit diacritics
  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese characters  (last update: 2008-05-09)

  // Japanese hiragana

  // 3 character syllables, っ doubles the consonant after
  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',

  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',

  // 2 character syllables - normal
  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
  'うぇ'=>'we','うぃ'=>'wi',
  'いぇ'=>'ye',

  // 2 character syllables, っ doubles the consonant after
  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',

  // 1 character syllables
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
  'わ'=>'wa','を'=>'wo',
  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
  // old characters
  'ゑ'=>'we','ゐ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',

  // never seen one of those (disabled for the moment)
  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',

  // 'spare' characters from other romanization systems
  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
  //'し'=>'ci','い'=>'yi','ぢ'=>'dzi',
  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',


  // Japanese katakana

  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
  'ッティー'=>'ttii',
  'ッヂィー'=>'ddii',

  // 3 character syllables - doubled vowels
  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
  // no 'フェー'/'フィー' entries in this row: a repeated key would override
  // the 'fee'/'fii' values of the row above (the last duplicate key wins)
  'フャー'=>'fyaa','フョー'=>'fyoo','フュー'=>'fyuu',
  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
  'ウェー'=>'wee','ウィー'=>'wii',
  'イェー'=>'yee',
  'ティー'=>'tii',
  'ヂィー'=>'dii',

  // 3 character syllables - doubled consonants
  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
  'ッティ'=>'tti',
  'ッヂィ'=>'ddi',

  // 3 character syllables - doubled vowel and consonants
  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
  'ッマー'=>'mmaa','ッメー'=>'mmee','ッミー'=>'mmii','ッモー'=>'mmoo','ッムー'=>'mmuu',
  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
  // 'ッチー' repeats a key from the 4 character section, so it must carry the same value
  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'cchii','ットー'=>'ttoo','ッツー'=>'ttsuu',
  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',

  // 2 character syllables - normal
  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
  'ウェ'=>'we','ウィ'=>'wi',
  'イェ'=>'ye',
  'ティ'=>'ti',
  'ヂィ'=>'di',

  // 2 character syllables - doubled vowel
  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
  'ワー'=>'waa','ヲー'=>'woo',
  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
  'ヵー'=>'kaa','ヶー'=>'kee',
  // old characters
  'ヱー'=>'wee','ヰー'=>'wii',

  // separate katakana 'n'
  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',

  // 2 character syllables - doubled consonants
  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
  'ッマ'=>'mma','ッメ'=>'mme','ッミ'=>'mmi','ッモ'=>'mmo','ッム'=>'mmu',
  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',

  // 1 character syllables
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
  'ワ'=>'wa','ヲ'=>'wo',
  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
  'ヵ'=>'ka','ヶ'=>'ke',
  // old characters
  'ヱ'=>'we','ヰ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',

  // special characters
  '・'=>'_','、'=>'_',
  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise

  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
  //'シ'=>'ci','フ'=>'hu','シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',

  // "Greeklish" (delta is transliterated as D/d)
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
  // sara i / sara ii => 'i', sara ue / sara uee => 'ue'
  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ื'=>'ue','ุ'=>'u','ู'=>'u',
  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',

  // Korean ('ㅚ' is 'oy', 'ㅛ' is 'yo')
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1683
1684
1685