xref: /dokuwiki/inc/utf8.php (revision dca6aaca5c13a4a9f3fd386d65cb44122cedaaf7)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9/**
10 * check for mb_string support
11 */
if(!defined('UTF8_MBSTRING')){
    // mbstring is used when the extension is loaded, unless the application
    // opted out by defining UTF8_NOMBSTRING before this file was included
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}
19
20/**
21 * Check if PREG was compiled with UTF-8 support
22 *
23 * Without this many of the functions below will not work, so this is a minimal requirement
24 */
// probe once: the match only succeeds when PCRE understands the /u modifier;
// the warning raised otherwise is silenced on purpose
defined('UTF8_PREGSUPPORT') || define('UTF8_PREGSUPPORT', (bool) @preg_match('/^.$/u', 'ñ'));
28
29/**
30 * Check if PREG was compiled with Unicode Property support
31 *
32 * This is not required for the functions below, but might be needed in a UTF-8 aware application
33 */
// probe once: \pL only compiles when PCRE was built with Unicode property
// support; the warning raised otherwise is silenced on purpose
defined('UTF8_PROPERTYSUPPORT') || define('UTF8_PROPERTYSUPPORT', (bool) @preg_match('/^\pL$/u', 'ñ'));
37
38
if(UTF8_MBSTRING){
    // make UTF-8 the encoding the mb_* functions assume by default
    mb_internal_encoding('UTF-8');
}
40
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string consists of 7bit ASCII bytes only
     *
     * The string is scanned bytewise for anything outside 0x00-0x7F.
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no high byte was found
     */
    function utf8_isASCII($str){
        $foundHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);
        return !$foundHighByte;
    }
}
54
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is removed, so the result
     * is a pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string
     */
    function utf8_strip($str){
        // no /u modifier: the class is applied byte by byte. This replaces the
        // former loop over $str{$i}, a syntax that no longer parses on PHP 8.
        return preg_replace('/[\x80-\xFF]/', '', (string) $str);
    }
}
77
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is checked: each lead byte (sequences of up
     * to six bytes are recognised) has to be followed by the announced
     * number of 10bbbbbb continuation bytes.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     *
     * @param string $Str
     * @return bool
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            // 0bbbbbbb - plain ASCII, nothing follows
            if ($lead < 0x80) {
                $pos++;
                continue;
            }

            // how many continuation bytes does the lead byte announce?
            if (($lead & 0xE0) == 0xC0) {        // 110bbbbb
                $trail = 1;
            } elseif (($lead & 0xF0) == 0xE0) {  // 1110bbbb
                $trail = 2;
            } elseif (($lead & 0xF8) == 0xF0) {  // 11110bbb
                $trail = 3;
            } elseif (($lead & 0xFC) == 0xF8) {  // 111110bb
                $trail = 4;
            } elseif (($lead & 0xFE) == 0xFC) {  // 1111110b
                $trail = 5;
            } else {
                return false;                    // matches no model
            }

            // each announced byte has to exist and look like 10bbbbbb
            for ($k = 1; $k <= $trail; $k++) {
                if ($pos + $k >= $total) return false;
                if ((ord($Str[$pos + $k]) & 0xC0) != 0x80) return false;
            }

            $pos += $trail + 1;
        }

        return true;
    }
}
108
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Leading and
     * trailing separators are removed before the last path component is
     * picked.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim($path,'\\/');

        // strrpos() yields false for a missing separator; max() then picks the
        // position of the one that was found (or false when there is none).
        // Position 0 cannot be a separator after the trim() above.
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if($rpos) $path = substr($path, $rpos+1);

        $suffix = (string) $suffix;
        $suflen = strlen($suffix);
        // strict comparison: with == two numeric looking strings such as
        // '100' and '1e2' compare equal and the name was cut off wrongly
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
135
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Uses mb_strlen() when the mbstring extension is in use (UTF8_MBSTRING),
     * in line with utf8_substr() and the case conversion functions.
     *
     * Without mbstring the utf8_decode() trick is kept as fallback:
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright. It is no
     * longer the default path because utf8_decode() is deprecated as of
     * PHP 8.2.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    mb_strlen()
     * @see    utf8_decode()
     *
     * @param string $string
     * @return int number of characters
     */
    function utf8_strlen($string){
        if(UTF8_MBSTRING) return mb_strlen($string, 'UTF-8');

        return strlen(utf8_decode($string));
    }
}
155
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Returns the part of a string selected by a character offset and an
     * optional character length. Delegates to mb_substr() when mbstring
     * is in use, otherwise a PCRE pattern with the 'u' flag does the work.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * Fallback without mbstring
         *
         * - PCRE only accepts repetition counts below 65536. Bigger offsets and
         *   lengths are therefore expressed as a repeated group of 65535
         *   characters plus a remainder.
         * - substr() may return false (e.g. offset > string length) while
         *   mb_substr() returns an empty string; this fallback follows mb_substr().
         * - Counting the characters of the string is comparatively expensive, so
         *   it only happens when needed (not for a positive offset without length).
         */

        // bring the parameters into shape once to avoid repeated notices
        $str    = (string) $str;
        $offset = (int) $offset;
        if ($length !== null) $length = (int) $length;

        // nothing to extract
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // character count, filled in lazily
        $strlen = null;

        // turn an offset counted from the end into one counted from the start
        // (a tail anchored pattern would work as well but is horribly slow)
        if ($offset < 0) {
            $strlen = strlen(utf8_decode($str));
            $offset = max(0, $strlen + $offset);
        }

        // part 1: a non-captured group that swallows $offset characters
        if ($offset > 0) {
            $blocks = (int) ($offset / 65535);
            $rest   = $offset % 65535;

            $skip = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
            $offset_pattern = '^(?:'.$skip.'.{'.$rest.'})';
        } else {
            // offset == 0; the anchor is all that is needed
            $offset_pattern = '^';
        }

        // part 2: the capturing group for the wanted characters
        $length_pattern = '';
        if ($length === null) {
            // everything up to the end of the string
            $length_pattern = '(.*)$';
        } else {
            if ($strlen === null) $strlen = strlen(utf8_decode($str));
            if ($offset > $strlen) return '';

            if ($length > 0) {
                // never reach beyond the end of the string
                $length = min($strlen - $offset, $length);

                $blocks = (int) ($length / 65535);
                $rest   = $length % 65535;

                // capture exactly $length characters
                $take = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $length_pattern = '('.$take.'.{'.$rest.'})';

            } else if ($length < 0) {
                if ($length < ($offset - $strlen)) return '';

                $blocks = (int) ((-$length) / 65535);
                $rest   = (-$length) % 65535;

                // capture everything but the last -$length characters, which
                // are matched by a group anchored at the end of the string
                $drop = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $length_pattern = '(.*)(?:'.$drop.'.{'.$rest.'})$';
            }
        }

        $found = preg_match('#'.$offset_pattern.$length_pattern.'#us', $str, $match);
        return $found ? $match[1] : '';
    }
}
260
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Note that, unlike substr_replace(), the default length is 0, which
     * means the replacement is inserted at $start without removing anything.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     *
     * @param string $string      input string
     * @param string $replacement the replacement
     * @param int    $start       the replacing will begin at the start'th offset into string.
     *                            A negative value counts from the end of string.
     * @param int    $length      If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If negative, it is the number of characters from the end of string
     *                            at which to stop replacing. If length is zero then this function will have the
     *                            effect of inserting replacement into string at the given start offset.
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // negative values are relative to the end of the string; translate
        // them into plain character counts first. Before, a negative $start
        // silently dropped the head of the string.
        if($start < 0 || $length < 0){
            $total = utf8_strlen($string);
            if($start < 0)  $start  = max(0, $total + $start);
            if($length < 0) $length = max(0, $total - $start + $length);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
284
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. Otherwise every
     * character of the charlist is stripped from the start of the string;
     * the charlist is taken literally (no ranges).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a characterclass. preg_quote() also takes
        // care of '^', which the former hand made quoting missed: a charlist
        // starting with '^' turned into a negated or broken class.
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
305
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. Otherwise every
     * character of the charlist is stripped from the end of the string;
     * the charlist is taken literally (no ranges).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a characterclass. preg_quote() also takes
        // care of '^', which the former hand made quoting missed.
        $charlist = preg_quote($charlist, '/');

        // D modifier: '$' must only match at the very end. Without it '$' also
        // matches in front of a final newline, so "abc--\n" lost its dashes
        // although the string does not end in one.
        return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
326
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Falls back to the native trim() when no charlist is given, otherwise
     * the end and then the start of the string are cleaned with
     * utf8_rtrim() and utf8_ltrim().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $tailCleaned = utf8_rtrim($str,$charlist);
            return utf8_ltrim($tailCleaned,$charlist);
        }

        return trim($str);
    }
}
344
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the string is
     * translated with the global $UTF8_UPPER_TO_LOWER table
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            // no mbstring: map the characters through the lookup table
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
365
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the string is
     * translated with the global $UTF8_LOWER_TO_UPPER table
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            // no mbstring: map the characters through the lookup table
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
386
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';

        // a single character is simply upper cased as a whole
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and the remainder
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $head = utf8_strtoupper($matches[1]);
        return $head.$matches[2];
    }
}
409
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @see http://www.php.net/ucwords
     *
     * @param string $str
     * @return string with first char of each word uppercase
     */
    function utf8_ucwords($str) {
        // Word separators: form feed, horizontal tab, vertical tab, linefeed,
        // carriage return and space. This corresponds to the definition of a
        // "word" given at http://www.php.net/ucwords
        $ws = '\x0c\x09\x0b\x0a\x0d\x20';

        // group 2: leading whitespace, group 3: first character of the word
        $pattern = '/(^|(['.$ws.']+))([^'.$ws.']{1})[^'.$ws.']*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @see utf8_ucwords
     * @see utf8_strtoupper
     *
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     */
    function utf8_ucwords_callback($matches) {
        // the whole match still carries the whitespace in front of the word
        $word    = ltrim($matches[0]);
        $capital = utf8_strtoupper($matches[3]);

        // put the whitespace back in front of the capitalised word
        return $matches[2] . utf8_substr_replace($word,$capital,0,1);
    }
}
448
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements are taken from the global $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS tables. Use the optional parameter to just
     * deaccent lower ($case = -1) or upper ($case = 1) letters. Default is
     * to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $wantLower = ($case <= 0);
        $wantUpper = ($case >= 0);

        if($wantLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($wantUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
474
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is translated
     * with the global $UTF8_ROMANIZATION table
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        // nothing to do
        return $string;
    }
}
491
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are read from the global $UTF8_SPECIAL_CHARS2
     * string. In addition the bytes 0x00 to 0x19 are stripped.
     * NOTE(review): the control character range ends at 0x1F - confirm
     * whether 0x19 is intended before changing it.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted list is built on the first call and reused afterwards
        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class unquoted
        $class = $additional.'\x00-\x19'.$specials;

        return preg_replace('/['.$class.']/u',$repl,$string);
    }
}
518
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched bytewise; the byte position found is then
     * converted into a character position. Matches in front of the
     * requested character offset are skipped.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   offset in characters
     * @return integer|false character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // extra bytes to add to the character offset for the byte search
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) return false;

            // character position that belongs to the byte position
            $length = utf8_strlen(substr($haystack, 0, $pos));

            // match lies in front of the wanted offset: search again further
            // behind, shifted by the multibyte overhead seen so far
            $tooEarly = ($length < $offset);
            if ($tooEarly) $comp = $pos - $length;
        } while ($tooEarly);

        return $length;
    }
}
550
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are passed through, code points below 0x100 become
     * decimal entities and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp < 0x80) {
                $pieces[] = chr($cp);
            } elseif ($cp < 0x100) {
                $pieces[] = '&#'.$cp.';';
            } else {
                $pieces[] = '&#x'.dechex($cp).';';
            }
        }

        return implode('', $pieces);
    }
}
575
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Numeric and named entities are then decoded in
     * one single pass, which avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder and its table are created once per request
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities === null) {
            // numeric entities only
            $pattern  = '/(&#([Xx])?([0-9A-Za-z]+);)/m';
            $callback = 'utf8_decode_numeric';
        } else {
            // numeric and named entities
            $pattern  = '/&(#)?([Xx])?([0-9A-Za-z]+);/m';
            $callback = array(&$decoder, 'decode');
        }

        return preg_replace_callback($pattern, $callback, $str);
    }
}
608
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array produced by the patterns in utf8_unhtml():
     * index 0 is the whole entity, index 2 the optional 'x'/'X' marker and
     * index 3 the digits.
     *
     * The patterns also let letters through (they are needed for named
     * entities), so something like &#foo; or &#xZZ; can end up here. Such
     * input is no numeric entity and is returned unchanged - it used to be
     * read as code point 0 and was replaced by a NUL byte.
     *
     * @param $ent string[] A numeric entity (match array)
     * @return string|false
     */
    function utf8_decode_numeric($ent) {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            // hexadecimal notation
            if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) return $ent[0];
            $cp = hexdec($digits);
        } else {
            // decimal notation
            if (!preg_match('/^[0-9]+$/D', $digits)) return $ent[0];
            $cp = intval($digits);
        }

        return unicode_to_utf8(array($cp));
    }
}
629
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps a named entity ('&eacute;') to its UTF-8 character */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly and only
         * needs to be flipped. Relying on the default encoding and converting
         * each character with ord() breaks as soon as PHP hands out UTF-8
         * (the default since PHP 5.4): ord() only looks at the first byte of
         * a multibyte character.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single byte character (read with ord()) to UTF-8. No
         * longer used by the constructor, kept for backward compatibility.
         *
         * @param string $c
         * @return string|false
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to it's correct UTF-8 char equivalent
         *
         * Expects the match array of the named entity pattern used in
         * utf8_unhtml(): index 0 is the whole entity, index 1 is '#' for
         * numeric entities. Unknown entities are returned unchanged.
         *
         * @param string[] $ent An entity (match array)
         * @return string|false
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }

            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }

            return $ent[0];
        }
    }
}
673
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING
     *
     * Without $strict nothing is rejected: bytes that cannot start or
     * continue a sequence are skipped and illegal code points (non-shortest
     * forms, surrogates, values outside the Unicode range) are still put
     * into the result. A sequence that is cut off by the end of the string
     * is dropped silently in both modes.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     *
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax is gone in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its six payload bits at the right place
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
848
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are skipped. Input that is no array
     * yields an empty string.
     *
     * The string is built by plain concatenation. The former output
     * buffering approach left its buffer open whenever strict mode bailed
     * out early, which swallowed all later output of the request.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # Outside the Unicode range. Negative values are handled here
                # as well, they used to be encoded as garbage 2 byte sequences

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );

                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (everything up to 0x10ffff)

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
942
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * When UTF8_MBSTRING is set the conversion is delegated to
     * mb_convert_encoding(). Otherwise the string is decoded with
     * utf8_to_unicode() and every code point is packed as a big endian
     * 16 bit unit; code points above 0xFFFF are written as a surrogate pair
     * so they are not truncated to their lower 16 bits.
     *
     * @param string $str UTF-8 input (taken by reference, not modified here)
     * @param bool $bom prepend the UTF-16BE byte order mark (0xFE 0xFF)?
     * @return string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // NOTE(review): utf8_to_unicode() is defined outside this block; this
        // guard assumes it may hand back a non-array (e.g. false) - confirm
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // supplementary plane: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
964
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big endian 16 bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF)
     * is combined into the code point it encodes; everything else is passed
     * on unchanged. The resulting code points are encoded by
     * unicode_to_utf8() in non-strict mode.
     *
     * @param string $str UTF-16BE input (taken by reference, not modified here)
     * @return false|string whatever unicode_to_utf8() returns for the decoded code points
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        // unpack() keys start at 1; reindex so neighbours can be addressed easily
        $units = is_array($units) ? array_values($units) : array();
        $count = count($units);

        $uni = array();
        for($i = 0; $i < $count; $i++){
            $cp = $units[$i];
            if($cp >= 0xD800 && $cp <= 0xDBFF && ($i + 1) < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF){
                // valid surrogate pair: rebuild the supplementary code point
                $cp = 0x10000 + (($cp - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++;
            }
            $uni[] = $cp;
        }
        return unicode_to_utf8($uni);
    }
}
979
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * The string is consumed from left to right. Every well-formed UTF-8
     * sequence (ASCII included) is copied to the result unchanged, every
     * byte that is not part of such a sequence is replaced by $replace.
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad
     *                        bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        // \G pins each match to the current offset, so the string is walked
        // in place instead of being re-copied with substr() after every match
        $pattern = '/\G'.$UTF8_BAD.'/S';
        $len     = strlen($str);
        $pos     = 0;
        $result  = '';

        while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
            // group 2 only takes part in the match for an invalid byte
            if ( !isset($matches[2])) {
                $result .= $matches[0];
            } else {
                $result .= $replace;
            }
            $pos += strlen($matches[0]);
        }
        return $result;
    }
}
1022
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside a UTF-8 string onto a character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the string
     * length yields the string length. Otherwise the index is moved off any
     * continuation byte (bit pattern 10xxxxxx) it points at.
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for the boundary,
     *                         false = backwards (start of the current character)
     *                         true  = forwards (start of the next character)
     *
     * @return int          byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $end = strlen($str);
        if ($i >= $end) return $end;

        if ($next) {
            // walk forwards until a non-continuation byte or the end is reached
            for (; $i < $end; $i++) {
                if ((ord($str[$i]) >> 6) !== 2) break;
            }
            return $i;
        }

        // walk backwards until a non-continuation byte or index 0 is reached
        for (; $i != 0; $i--) {
            if ((ord($str[$i]) >> 6) !== 2) break;
        }
        return $i;
    }
}
1053
1054// only needed if no mb_string available
1055if(!UTF8_MBSTRING){
1056    /**
1057     * UTF-8 Case lookup table
1058     *
1059     * This lookup table maps lower case letters to their corresponding
1060     * upper case letters in UTF-8
1061     *
1062     * @author Andreas Gohr <andi@splitbrain.org>
1063     */
1064    global $UTF8_LOWER_TO_UPPER;
1065    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1066            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
1067            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
1068            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1069            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1070            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1071            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1072            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1073            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1074            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1075            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1076            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1077            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1078            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1079            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1080            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1081            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1082            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1083            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1084            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1085            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1086            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1087            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1088            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1089            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1090            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1091            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1092            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1093            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1094            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1095            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1096            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1097            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1098            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1099            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1100            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1101            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1102            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1103            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1104            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1105            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1106            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1107            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1108            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1109            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1110            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1111            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1112            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1113            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1114            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1115            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1116            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1117            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1118            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1119            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1120            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1121            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1122            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1123            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1124            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1125            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1126            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1127            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1128            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1129            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1130            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1131            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1132            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1133                );
1134
1135    /**
1136     * UTF-8 Case lookup table
1137     *
1138     * This lookup table maps upper case letters to their corresponding
1139     * lower case letters in UTF-8
1140     *
1141     * @author Andreas Gohr <andi@splitbrain.org>
1142     */
1143    global $UTF8_UPPER_TO_LOWER;
1144    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1145            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1146            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1147            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1148            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1149            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1150            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1151            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1152            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1153            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1154            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1155            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1156            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1157            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1158            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1159            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1160            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1161            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1162            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1163            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1164            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1165            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1166            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1167            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1168            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1169            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1170            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1171            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1172            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1173            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1174            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1175            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1176            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1177            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1178            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1179            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1180            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1181            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1182            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1183            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1184            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1185            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1186            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1187            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1188            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1189            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1190            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1191            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1192            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1193            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1194            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1195            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1196            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1197            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1198            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1199            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1200            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1201            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1202            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1203            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1204            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1205            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1206            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1207            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1208            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1209            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1210            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1211            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1212                );
1213}; // end of case lookup tables
1214
1215/**
1216 * UTF-8 lookup table for lower case accented letters
1217 *
1218 * This lookup table defines replacements from the ASCII-7 range for accented
1219 * characters. These are lower case letters only.
1220 *
1221 * @author Andreas Gohr <andi@splitbrain.org>
1222 * @see    utf8_deaccent()
1223 */
1224global $UTF8_LOWER_ACCENTS;
1225if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1226  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1227  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1228  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1229  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1230  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1231  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1232  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1233  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1234  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1235  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1236  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1237  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1238  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1239  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1240  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1241);
1242
1243/**
1244 * UTF-8 lookup table for upper case accented letters
1245 *
1246 * This lookup table defines replacements from the ASCII-7 range for accented
1247 * characters. These are upper case letters only.
1248 *
1249 * @author Andreas Gohr <andi@splitbrain.org>
1250 * @see    utf8_deaccent()
1251 */
1252global $UTF8_UPPER_ACCENTS;
1253if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1254  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1255  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1256  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1257  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1258  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1259  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1260  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1261  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1262  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1263  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1264  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1265  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1266  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1267  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1268  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1269);
1270
1271/**
1272 * UTF-8 array of common special characters
1273 *
1274 * This array should contain all special characters (not a letter or digit)
1275 * defined in the various local charsets - it's not a complete list of non-alphanum
1276 * characters in UTF-8. It's not perfect but should match most cases of special
1277 * chars.
1278 *
1279 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1280 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1281 *
1282 * @author Andreas Gohr <andi@splitbrain.org>
1283 * @see    utf8_stripspecials()
1284 */
1285global $UTF8_SPECIAL_CHARS;
1286if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1287  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1288  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1289          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1290  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1291  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1292  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1293  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1294  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1295  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1296  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1297  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1298  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1299  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1300  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1301  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1302  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1303  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1304  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1305  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1306  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1307  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1308  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1309  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1310  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1311  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1312  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1313  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1314  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1315  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1316  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1317  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1318  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1319  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1320  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1321  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1322  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1323  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1324  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1325  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1326  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1327  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1328  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1329  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1330  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1331  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1332  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1333  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1334  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1335  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1336  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1337  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1338  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1339  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1340          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1341  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1342  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1343  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1344  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1345  0xffeb, 0xffec, 0xffed, 0xffee,
1346  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1347  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1348  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
1349  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
1350  0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1351);
1352
1353// utf8 version of above data
1354global $UTF8_SPECIAL_CHARS2;
1355if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1356    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1357    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1358    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1359    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1360    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1361    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1362    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1363    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1364    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1365    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1366    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1367    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1368    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1369    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1370    '➷➸➹➺➻➼➽➾'.
1371    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1372    '�'.
1373    '�ﹼﹽ'.
1374    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1375    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1376    '����������������������������������������������������������������'.
1377    '   ⁠';
1378
1379/**
1380 * Romanization lookup table
1381 *
1382 * This lookup table provides a way to transform strings written in a language
1383 * different from the ones based upon latin letters into plain ASCII.
1384 *
1385 * Please note: this is not a scientific transliteration table. It only works
1386 * one way from non-Latin to ASCII and it works by simple character replacement
1387 * only. Specialities of each language are not supported.
1388 *
1389 * @author Andreas Gohr <andi@splitbrain.org>
1390 * @author Vitaly Blokhin <vitinfo@vitn.com>
1391 * @link   http://www.uconv.com/translit.htm
1392 * @author Bisqwit <bisqwit@iki.fi>
1393 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1394 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1395 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1396 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1397 * @author Arthit Suriyawongkul <arthit@gmail.com>
1398 * @author Denis Scheither <amorphis@uni-bremen.de>
1399 * @author Eivind Morland <eivind.morland@gmail.com>
1400 */
1401global $UTF8_ROMANIZATION;
1402if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1403  // scandinavian - differs from what we do in deaccent
1404  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1405
1406  //russian cyrillic
1407  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1408  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1409  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1410  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1411  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1412  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1413  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1414  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1415  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1416  // Ukrainian cyrillic
1417  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1418  // Georgian
1419  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1420  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1421  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1422  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1423  'ჰ'=>'xh',
1424  //Sanskrit
1425  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1426  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1427  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1428  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1429  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1430  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1431  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1432  //Sanskrit diacritics
1433  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1434  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1435  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1436  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1437  //Hebrew
1438  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1439  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1440  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1441  'ש'=>'sh','ת'=>'t',
1442  //Arabic
1443  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1444  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1445  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1446  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1447
1448  // Japanese characters  (last update: 2008-05-09)
1449
1450  // Japanese hiragana
1451
1452  // 3 character syllables, っ doubles the consonant after
1453  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1454  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1455  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1456  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1457  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1458  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1459  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1460  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1461  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1462  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1463  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1464
1465  // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1466  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1467  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1468
1469   // 2 character syllables - normal
1470  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1471  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1472  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1473  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1474  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1475  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1476  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1477  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1478  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1479  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1480  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1481  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1482  'うぇ'=>'we','うぃ'=>'wi',
1483  'いぇ'=>'ye',
1484
1485  // 2 character syllables, っ doubles the consonant after
1486  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1487  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1488  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1489  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1490  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1491  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1492  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1493  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1494  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1495  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1496  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1497
1498  // 1 character syllabels
1499  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1500  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1501  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1502  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1503  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1504  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1505  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1506  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1507  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1508  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1509  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1510  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1511  'わ'=>'wa','を'=>'wo',
1512  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1513  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1514  // old characters
1515  'ゑ'=>'we','ゐ'=>'wi',
1516
1517  //  convert what's left (probably only kicks in when something's missing above)
1518  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1519  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1520
1521  // never seen one of those (disabled for the moment)
1522  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1523  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1524  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1525  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1526  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1527  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1528  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1529  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1530  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1531  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1532  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1533  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1534  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1535  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1536
1537  // 'spare' characters from other romanization systems
1538  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1539  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1540  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1541  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1542  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1543  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1544  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1545  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1546  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1547  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1548
1549
1550  // Japanese katakana
1551
1552  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs)
1553  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1554  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1555  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1556  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1557  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1558  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1559  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1560  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1561  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1562  'ッティー'=>'ttii',
1563  'ッヂィー'=>'ddii',
1564
1565  // 3 character syllables - doubled vowels
1566  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1567  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1568  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1569  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1570  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1571  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1572  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1573  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1574  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1575  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1576  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1577  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1578  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1579  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1580  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1581  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1582  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1583  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1584  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1585  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1586  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1587  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1588  'ウェー'=>'wee','ウィー'=>'wii',
1589  'イェー'=>'yee',
1590  'ティー'=>'tii',
1591  'ヂィー'=>'dii',
1592
1593  // 3 character syllables - doubled consonants
1594  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1595  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1596  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1597  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1598  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1599  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1600  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1601  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1602  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1603  'ッティ'=>'tti',
1604  'ッヂィ'=>'ddi',
1605
1606  // 3 character syllables - doubled vowel and consonants
1607  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1608  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1609  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1610  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1611  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1612  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1613  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1614  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1615  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1616  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1617  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1618
1619  // 2 character syllables - normal
1620  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1621  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1622  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1623  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1624  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1625  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1626  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1627  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1628  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1629  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1630  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1631  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1632  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1633  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1634  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1635  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1636  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1637  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1638  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1639  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1640  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1641  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1642  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1643  'ウェ'=>'we','ウィ'=>'wi',
1644  'イェ'=>'ye',
1645  'ティ'=>'ti',
1646  'ヂィ'=>'di',
1647
1648  // 2 character syllables - doubled vocal
1649  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1650  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1651  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1652  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1653  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1654  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1655  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1656  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1657  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1658  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1659  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1660  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1661  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1662  'ワー'=>'waa','ヲー'=>'woo',
1663  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1664  'ヵー'=>'kaa','ヶー'=>'kee',
1665  // old characters
1666  'ヱー'=>'wee','ヰー'=>'wii',
1667
1668  // seperate katakana 'n'
1669  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1670  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1671
1672  // 2 character syllables - doubled consonants
1673  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1674  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1675  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1676  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1677  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1678  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1679  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1680  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1681  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1682  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1683  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1684
1685  // 1 character syllables
1686  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1687  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1688  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1689  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1690  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1691  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1692  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1693  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1694  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1695  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1696  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1697  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1698  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1699  'ワ'=>'wa','ヲ'=>'wo',
1700  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1701  'ヵ'=>'ka','ヶ'=>'ke',
1702  // old characters
1703  'ヱ'=>'we','ヰ'=>'wi',
1704
1705  //  convert what's left (probably only kicks in when something's missing above)
1706  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1707  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1708
1709  // special characters
1710  '・'=>'_','、'=>'_',
1711  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1712
1713  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1714  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1715  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1716  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1717  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1718  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1719  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1720  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1721  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1722  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1723  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1724  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1725
1726  // "Greeklish"
1727  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1728  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1729
1730  // Thai
1731  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1732  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1733  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1734  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1735  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1736  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1737  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1738  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1739  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1740  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1741  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1742  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1743  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1744  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1745  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1746  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1747  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1748  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1749
1750  // Korean
1751  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1752  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1753  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1754  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1755  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1756  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1757);
1758
1759
1760