xref: /dokuwiki/inc/utf8.php (revision e8b5a4f91c8a6e230a6cfe13c43dc9ddce31e253)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect mbstring support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, 0 otherwise. A UTF8_MBSTRING that is already defined is kept.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// with mbstring in use, make UTF-8 the internal encoding of the mb_* functions
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str
     * @return bool true when no byte outside 0x00-0x7F was matched
     */
    function utf8_isASCII($str){
        // preg_match() yields 1 as soon as a single non-ASCII byte is found
        $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str);
        return $foundHighByte !== 1;
    }
}
32
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is dropped, so the result is a
     * pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax is gone in PHP 8
            $byte = $str[$i];
            if(ord($byte) < 128){
                $ascii .= $byte;
            }
        }
        return $ascii;
    }
}
52
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Walks the bytes and verifies that every lead byte is followed by the
     * number of 10bbbbbb continuation bytes its bit pattern announces. Lead
     * bytes announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str
     * @return bool false on the first byte that does not fit the scheme
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {                      // 0bbbbbbb
                $pos++;
                continue;
            }

            if (($lead & 0xE0) == 0xC0) {            // 110bbbbb
                $expected = 1;
            } elseif (($lead & 0xF0) == 0xE0) {      // 1110bbbb
                $expected = 2;
            } elseif (($lead & 0xF8) == 0xF0) {      // 11110bbb
                $expected = 3;
            } elseif (($lead & 0xFC) == 0xF8) {      // 111110bb
                $expected = 4;
            } elseif (($lead & 0xFE) == 0xFC) {      // 1111110b
                $expected = 5;
            } else {
                return false;                        // matches no model
            }

            // the announced number of continuation bytes has to follow
            for ($seen = 0; $seen < $expected; $seen++) {
                $pos++;
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
            }

            $pos++;
        }

        return true;
    }
}
80
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Trailing
     * separators are ignored, and a path without any separator is returned
     * unchanged (apart from suffix removal).
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        // like basename(): "/foo/bar/" names "bar"
        $path = rtrim($path, '/\\');

        $slash     = strrpos($path, '/');
        $backslash = strrpos($path, '\\');

        if($slash === false && $backslash === false){
            // no directory part at all - never add 1 to a boolean false here,
            // that would chop off the first character of the name
            $file = $path;
        }else{
            $rpos = max((int) $slash, (int) $backslash);
            $file = substr($path, $rpos+1);
        }

        $suflen = strlen($suffix);
        // strict comparison: numeric looking strings must not compare loosely
        if($suflen && (substr($file, -$suflen) === $suffix)){
            $file = substr($file, 0, -$suflen);
        }

        return $file;
    }
}
105
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() turns every character into exactly one byte (characters
     * outside ISO-8859-1 become '?'), so the byte length of its result is the
     * character count of the input.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string
     * @return int number of characters
     */
    function utf8_strlen($string){
        $singleByte = utf8_decode($string);
        return strlen($singleByte);
    }
}
122
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * With mbstring enabled this simply delegates to mb_substr(); otherwise
     * the substring is cut out with a PCRE pattern in UTF-8 mode.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * No mbstring: build a regular expression (with the 'u' flag) that
         * skips $offset characters and captures the wanted part.
         *
         * - PCRE repetition counts have to stay below 65536, so larger
         *   offsets/lengths are expressed as repeated groups of 65535
         *   characters plus a remainder.
         * - Like mb_substr(), an empty string is returned where substr()
         *   might return false (e.g. offset beyond the end of the string).
         * - Counting the characters of the string is comparatively expensive
         *   and therefore only done when needed; it is not needed for a
         *   positive offset without a length.
         */

        // normalise the parameter types once to avoid repeated notices
        $str    = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $charCount = null;   // character count of $str, calculated lazily

        // translate a negative offset into a positive one (tail anchored
        // patterns would work as well but are very slow)
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = $charCount + $offset;
            if ($offset < 0) $offset = 0;
        }

        // non-capturing group swallowing the first $offset characters
        if ($offset > 0) {
            $blocks = (int)($offset/65535);
            $rest   = $offset%65535;

            $skip = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
            $offset_pattern = '^(?:'.$skip.'.{'.$rest.'})';
        } else {
            $offset_pattern = '^';   // nothing to skip, just anchor
        }

        $length_pattern = '';
        if ($length === null) {
            // everything up to the end of the string
            $length_pattern = '(.*)$';
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';

            if ($length > 0) {
                // never ask for more characters than are left
                $length = min($charCount-$offset, $length);

                $blocks = (int)($length/65535);
                $rest   = $length%65535;

                // captured group of exactly $length characters
                $take = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $length_pattern = '('.$take.'.{'.$rest.'})';

            } elseif ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $blocks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;

                // capture everything except the last -$length characters,
                // which are anchored at the end of the string
                $drop = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $length_pattern = '(.*)(?:'.$drop.'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
226
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is built from the first $start characters (only when $start
     * is positive), the replacement, and everything from character
     * $start+$length onwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string
     * @param  string $replacement
     * @param  int    $start  character offset where the replacement begins
     * @param  int    $length number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $tail = utf8_substr($string, $start+$length);

        return $head.$replacement.$tail;
    }
}
242
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. Otherwise every leading
     * character contained in $charlist is removed. The charlist is a plain
     * list of characters, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; '^' has to be escaped
        // as well, a leading one would negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
262
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. Otherwise every trailing
     * character contained in $charlist is removed. The charlist is a plain
     * list of characters, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; '^' has to be escaped
        // as well, a leading one would negate (or break) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
282
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist the native trim() is used, otherwise the string is
     * trimmed on the right and then on the left with the UTF-8 aware helpers.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist == ''){
            return trim($str);
        }

        $rightTrimmed = utf8_rtrim($str,$charlist);
        return utf8_ltrim($rightTrimmed,$charlist);
    }
}
299
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses the mbstring extension if available, otherwise the string is
     * translated with the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
317
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses the mbstring extension if available, otherwise the string is
     * translated with the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
335
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and the remainder
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];

        return utf8_strtoupper($first).$rest;
    }
}
357
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches form feeds, horizontal
        // tabs, vertical tabs, linefeeds, carriage returns and spaces.
        // This corresponds to the definition of a "word" at http://www.php.net/ucwords
        //
        // group 2: whitespace in front of the word (empty at string start)
        // group 3: first character of the word
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // Take the remainder of the word by byte position instead of
        // ltrim()ing the match: ltrim() neither strips form feeds (which the
        // pattern treats as whitespace) nor keeps a leading NUL byte (which
        // the pattern treats as part of the word).
        $rest = substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
394
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacements come from the global $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS tables.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @param  int    $case   -1, 0 or 1, see above
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $wantLower = ($case <= 0);
        $wantUpper = ($case >= 0);

        if($wantLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($wantUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
416
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, anything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string,$UTF8_ROMANIZATION);
        }

        return $string;
    }
}
430
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters of the global $UTF8_SPECIAL_CHARS2 string, the
     * control chars 0x00 to 0x19 and the characters given in $additional are
     * replaced as well.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted special chars are computed once and reused by later calls
        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class as is (not quoted)
        $class = $additional.'\x00-\x19'.$specials;

        return preg_replace('/['.$class.']/u',$repl,$string);
    }
}
456
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched byte-wise with strpos(); each hit is converted
     * into a character position. Hits in front of the requested character
     * offset are skipped by searching again further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // number of bytes the byte offset has to be ahead of the char offset
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            // character position of the hit
            $length = utf8_strlen(substr($haystack, 0, $pos));

            // hit lies before the wanted offset: compensate for the
            // multibyte characters seen so far and try again
            if ($length < $offset) {
                $comp = $pos - $length;
            }
        } while ($length < $offset);

        return $length;
    }
}
487
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are kept, code points below 0x100 become decimal
     * entities and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $ret = '';

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $ret .= '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $ret .= "&#$cp;";
            } else {
                $ret .= chr($cp);
            }
        }

        return $ret;
    }
}
509
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint.
     * By default ($entities is null) only numeric entities are decoded.
     * Pass any non-null value (e.g. HTML_ENTITIES) and named entities,
     * including &amp; &lt; etc. are handled as well. Doing both in a single
     * pass avoids the problem that would occur if you had to decode
     * "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
541
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array of the patterns used in utf8_unhtml():
     * index 2 holds the optional hex marker ('x' or 'X'), index 3 the digits.
     *
     * @param  array $ent match array of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');

        if ($isHex) {
            $cp = hexdec($ent[3]);
        } else {
            $cp = intval($ent[3]);
        }

        return unicode_to_utf8(array($cp));
    }
}
562
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps a named entity (e.g. '&eacute;') to its UTF-8 character */
        var $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly, so its
         * characters can be used as they are. Relying on the default encoding
         * and converting with ord() only works while the table is single-byte
         * (ISO-8859-1); with a UTF-8 table ord() sees just the first byte of
         * each character.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single-byte (ISO-8859-1) character to UTF-8. No longer
         * used by the constructor, kept for callers of this method.
         *
         * @param $c string a single-byte character
         * @return mixed
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Expects the match array of the named-entity pattern used in
         * utf8_unhtml(): index 0 is the whole entity, index 1 is '#' for
         * numeric entities.
         *
         * @param $ent array match array of an entity
         * @return string the decoded character, or the entity itself if unknown
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }

            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }

            return $ent[0];
        }
    }
}
606
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. Without $strict, bytes that fit nowhere are
     * skipped and illegal sequences are still added to the output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax is gone in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its six payload bits at the
                    // position given by the number of octets still expected
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /*
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /*
                     * ((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
780
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING. Without
     * $strict such values are skipped silently.
     *
     * The result is built by plain string concatenation. Output buffering is
     * deliberately not used: an early return in strict mode would otherwise
     * leave a buffer open and leak the bytes collected so far.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                // out of range (negative values must not reach the encoder)
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                // ASCII range (including control chars)
                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                // 2 byte sequence
                $result .= chr(0xc0 | ($cp >> 6))
                         . chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                // Byte order mark: skipped on purpose

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                // surrogates are illegal
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                // 3 byte sequence
                $result .= chr(0xe0 | ($cp >> 12))
                         . chr(0x80 | (($cp >> 6) & 0x003f))
                         . chr(0x80 | ($cp & 0x003f));

            } else {
                // 4 byte sequence
                $result .= chr(0xf0 | ($cp >> 18))
                         . chr(0x80 | (($cp >> 12) & 0x3f))
                         . chr(0x80 | (($cp >> 6) & 0x3f))
                         . chr(0x80 | ($cp & 0x3f));
            }
        }

        return $result;
    }
}
873
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is enabled. The fallback
     * decodes the string with utf8_to_unicode() and writes every code point
     * as a big endian 16 bit unit; code points above U+FFFF are written as a
     * surrogate pair instead of being truncated to 16 bits.
     *
     * @param  string $str UTF-8 encoded string (taken by reference, not modified)
     * @param  bool   $bom prepend the UTF-16BE byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0x10FFFF){
                // not encodable in UTF-16 (non-strict decoding may hand such
                // values through): use U+FFFD REPLACEMENT CHARACTER
                $out .= "\xFF\xFD";
            }elseif($cp > 0xFFFF){
                // astral plane: split into high and low surrogate
                $cp  -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
891
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big endian 16 bit units. A high surrogate
     * (D800..DBFF) directly followed by a low surrogate (DC00..DFFF) is
     * combined into the code point it encodes before the list is handed to
     * unicode_to_utf8(). Unpaired surrogates are passed on unchanged, so
     * unicode_to_utf8() decides what happens to them.
     *
     * @param string $str  UTF-16BE encoded string (taken by reference)
     * @return string      UTF-8 encoded string as built by unicode_to_utf8()
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        // nothing could be unpacked: convert an empty code point list
        if(!is_array($units)) $units = array();

        $uni  = array();
        $high = null; // pending high surrogate waiting for its partner
        foreach($units as $unit){
            if($high !== null){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    // valid pair: rebuild the supplementary plane code point
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = null;
                    continue;
                }
                // high surrogate without partner: hand it on as it is
                $uni[] = $high;
                $high  = null;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        // input ended on a high surrogate
        if($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
903
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * The string is consumed from left to right. Every well formed UTF-8
     * sequence is copied to the result, every byte that does not start one
     * is substituted by $replace.
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        removes the bad bytes) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        $result = '';
        $offset = 0;
        $length = strlen($str);

        // Walk through the string by offset instead of cutting the consumed
        // part off with substr() on every round. The A modifier anchors each
        // match at the current offset.
        while ($offset < $length && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $offset)) {
            // group 2 is only set when the "invalid byte" alternative matched
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $offset += strlen($matches[0]);
        }
        return $result;
    }
}
945
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte offset off of UTF-8 continuation bytes
     *
     * Offsets at or below zero give 0, offsets at or beyond the end of the
     * string give the string length. Any other offset is shifted in the
     * requested direction for as long as it points at a continuation byte
     * (bit pattern 10xxxxxx).
     *
     * @param $str   string   utf8 character string (taken by reference)
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to shift the index in,
     *                           false = backwards, to the start of the character
     *                                   the index falls into
     *                           true  = forwards, to the start of the following
     *                                   character
     *
     * @return int            adjusted byte index into $str
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $bytes = strlen($str);
        if ($i >= $bytes) return $bytes;

        if ($next) {
            // forwards: stop at the first non continuation byte or at the end
            for (; $i < $bytes; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
            return $i;
        }

        // backwards: stop at the first non continuation byte or at index 0
        for (; $i; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
976
977// only needed if no mb_string available
978if(!UTF8_MBSTRING){
979    /**
980     * UTF-8 Case lookup table
981     *
     * This lookup table maps each lower case letter to its corresponding
     * upper case letter in UTF-8
984     *
985     * @author Andreas Gohr <andi@splitbrain.org>
986     */
987    global $UTF8_LOWER_TO_UPPER;
988    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
989            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
990            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
991            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
992            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
993            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
994            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
995            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
996            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
997            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
998            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
999            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1000            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1001            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1002            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1003            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1004            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1005            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1006            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1007            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1008            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1009            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1010            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1011            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1012            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1013            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1014            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1015            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1016            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1017            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1018            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1019            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1020            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1021            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1022            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1023            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1024            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1025            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1026            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1027            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1028            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1029            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1030            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1031            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1032            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1033            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1034            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1035            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1036            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1037            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1038            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1039            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1040            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1041            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1042            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1043            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1044            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1045            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1046            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1047            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1048            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1049            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1050            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1051            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1052            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1053            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1054            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1055            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1056                );
1057
1058    /**
1059     * UTF-8 Case lookup table
1060     *
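     * This lookup table maps each upper case letter to its corresponding
     * lower case letter in UTF-8
     *
     * NOTE(review): several keys occur more than once in the array literal and
     * the entry listed last wins. This looks like it leaves U+03A3 (capital
     * sigma) mapped to U+03C2 (final sigma) and U+039C (capital mu) mapped to
     * U+00B5 (micro sign) - confirm whether U+03C3 and U+03BC were intended.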
1063     *
1064     * @author Andreas Gohr <andi@splitbrain.org>
1065     */
1066    global $UTF8_UPPER_TO_LOWER;
1067    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1068            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1069            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1070            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1071            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1072            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1073            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1074            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1075            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1076            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1077            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1078            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1079            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1080            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1081            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1082            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1083            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1084            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1085            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1086            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1087            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1088            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1089            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1090            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1091            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1092            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1093            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1094            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1095            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1096            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1097            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1098            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1099            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1100            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1101            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1102            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1103            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1104            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1105            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1106            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1107            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1108            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1109            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1110            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1111            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1112            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1113            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1114            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1115            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1116            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1117            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1118            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1119            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1120            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1121            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1122            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1123            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1124            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1125            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1126            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1127            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1128            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1129            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1130            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1131            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1132            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1133            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1134            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1135                );
1136}; // end of case lookup tables
1137
1138/**
1139 * UTF-8 lookup table for lower case accented letters
1140 *
 * This lookup table maps accented characters to replacements from the ASCII-7
 * range. These are lower case letters only.
1143 *
1144 * @author Andreas Gohr <andi@splitbrain.org>
1145 * @see    utf8_deaccent()
1146 */
1147global $UTF8_LOWER_ACCENTS;
1148if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1149  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1150  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1151  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1152  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1153  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1154  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1155  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1156  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1157  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1158  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1159  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1160  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1161  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1162  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1163  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1164);
1165
1166/**
1167 * UTF-8 lookup table for upper case accented letters
1168 *
 * This lookup table maps accented characters to replacements from the ASCII-7
 * range. These are upper case letters only.
1171 *
1172 * @author Andreas Gohr <andi@splitbrain.org>
1173 * @see    utf8_deaccent()
1174 */
1175global $UTF8_UPPER_ACCENTS;
1176if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1177  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1178  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1179  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1180  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1181  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1182  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1183  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1184  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1185  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1186  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1187  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1188  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1189  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1190  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1191  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1192);
1193
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point (not a UTF-8 byte sequence).
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // figure space, narrow no-break space, word joiner, byte order mark.
  // These used to be listed as their UTF-8 byte sequences (0xe28087, ...)
  // which are not code points; the no-break space 0x00a0 is listed above.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1275
1276// utf8 version of above data
1277global $UTF8_SPECIAL_CHARS2;
1278if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1279    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1280    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1281    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1282    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1283    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1284    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1285    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1286    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1287    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1288    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1289    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1290    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1291    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1292    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1293    '➷➸➹➺➻➼➽➾'.
1294    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1295    '�'.
1296    '�ﹼﹽ'.
1297    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1298    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1299    '����������������������������������������������������������������'.
1300    '   ⁠';
1301
1302/**
1303 * Romanization lookup table
1304 *
1305 * This lookup tables provides a way to transform strings written in a language
1306 * different from the ones based upon latin letters into plain ASCII.
1307 *
1308 * Please note: this is not a scientific transliteration table. It only works
1309 * oneway from nonlatin to ASCII and it works by simple character replacement
1310 * only. Specialities of each language are not supported.
1311 *
1312 * @author Andreas Gohr <andi@splitbrain.org>
1313 * @author Vitaly Blokhin <vitinfo@vitn.com>
1314 * @link   http://www.uconv.com/translit.htm
1315 * @author Bisqwit <bisqwit@iki.fi>
1316 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1317 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1318 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1319 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1320 * @author Arthit Suriyawongkul <arthit@gmail.com>
1321 * @author Denis Scheither <amorphis@uni-bremen.de>
1322 * @author Eivind Morland <eivind.morland@gmail.com>
1323 */
// Romanization table: maps UTF-8 character sequences (keys) to their ASCII
// replacements (values). The table is only populated when $UTF8_ROMANIZATION
// is still empty, so a table that was defined earlier is left untouched.
//
// NOTE: when a key is repeated inside a PHP array literal the LAST value wins,
// so a repeated key silently overrides its earlier entry.
// NOTE(review): multi-character keys presumably rely on the consumer matching
// the longest key first (e.g. strtr() with an array) - confirm against caller.
global $UTF8_ROMANIZATION;
if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
  // scandinavian - differs from what we do in deaccent
  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',

  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Sanskrit diacritics
  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese characters  (last update: 2008-05-09)

  // Japanese hiragana

  // 3 character syllables, っ doubles the consonant after
  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',

  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',

   // 2 character syllables - normal
  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
  'うぇ'=>'we','うぃ'=>'wi',
  'いぇ'=>'ye',

  // 2 character syllables, っ doubles the consonant after
  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',

  // 1 character syllables
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
  'わ'=>'wa','を'=>'wo',
  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
  // old characters
  'ゑ'=>'we','ゐ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',

  // never seen one of those (disabled for the moment)
  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',

  // 'spare' characters from other romanization systems
  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',


  // Japanese katakana

  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
  'ッティー'=>'ttii',
  'ッヂィー'=>'ddii',

  // 3 character syllables - doubled vowels
  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
  // 'フェー' and 'フィー' are intentionally not repeated here: a second entry
  // would override the 'fee'/'fii' values above (last duplicate key wins)
  'フャー'=>'fyaa','フョー'=>'fyoo','フュー'=>'fyuu',
  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
  'ウェー'=>'wee','ウィー'=>'wii',
  'イェー'=>'yee',
  'ティー'=>'tii',
  'ヂィー'=>'dii',

  // 3 character syllables - doubled consonants
  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
  'ッティ'=>'tti',
  'ッヂィ'=>'ddi',

  // 3 character syllables - doubled vowel and consonants
  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
  'ッマー'=>'mmaa','ッメー'=>'mmee','ッミー'=>'mmii','ッモー'=>'mmoo','ッムー'=>'mmuu',
  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'cchii','ットー'=>'ttoo','ッツー'=>'ttsuu',
  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',

  // 2 character syllables - normal
  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
  'ウェ'=>'we','ウィ'=>'wi',
  'イェ'=>'ye',
  'ティ'=>'ti',
  'ヂィ'=>'di',

  // 2 character syllables - doubled vowel
  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
  'ワー'=>'waa','ヲー'=>'woo',
  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
  'ヵー'=>'kaa','ヶー'=>'kee',
  // old characters
  'ヱー'=>'wee','ヰー'=>'wii',

  // separate katakana 'n'
  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',

  // 2 character syllables - doubled consonants
  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
  'ッマ'=>'mma','ッメ'=>'mme','ッミ'=>'mmi','ッモ'=>'mmo','ッム'=>'mmu',
  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',

  // 1 character syllables
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
  'ワ'=>'wa','ヲ'=>'wo',
  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
  'ヵ'=>'ka','ヶ'=>'ke',
  // old characters
  'ヱ'=>'we','ヰ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',

  // special characters
  '・'=>'_','、'=>'_',
  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise

  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
  // sara i (U+0E34), sara ii (U+0E35), sara ue (U+0E36), sara uee (U+0E37)
  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ื'=>'ue','ุ'=>'u','ู'=>'u',
  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1681
1682
1683