xref: /dokuwiki/inc/utf8.php (revision dfc5e46cbee86f9c2f44b5bc5f72505dd8847352)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Decide whether the mbstring extension is used
 *
 * UTF8_MBSTRING is 1 when mb_substr() exists and UTF8_NOMBSTRING has not been
 * defined, 0 otherwise. A value defined before this file is loaded is kept.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

/**
 * Probe PCRE for UTF-8 support (the /u modifier)
 *
 * Many of the functions below rely on it, so this is a minimal requirement.
 */
defined('UTF8_PREGSUPPORT') || define('UTF8_PREGSUPPORT', (bool) @preg_match('/^.$/u', 'ñ'));

/**
 * Probe PCRE for Unicode property support (\p classes)
 *
 * Not needed by the functions below, but a UTF-8 aware application may want to know.
 */
defined('UTF8_PROPERTYSUPPORT') || define('UTF8_PROPERTYSUPPORT', (bool) @preg_match('/^\pL$/u', 'ñ'));


// make the mb_* functions work on UTF-8 when they are going to be used
if(UTF8_MBSTRING) mb_internal_encoding('UTF-8');
40
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str the string to inspect
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        // byte-wise match (no /u modifier): 1 means a high byte was seen
        $highByteFound = preg_match('/(?:[^\x00-\x7F])/', $str);
        return $highByteFound !== 1;
    }
}
51
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is dropped, so the result is a
     * pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to clean
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax is a parse error since PHP 8.0
            $byte = $str[$i];
            if(ord($byte) < 128){
                $ascii .= $byte;
            }
        }
        return $ascii;
    }
}
71
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the structure is looked at: every lead byte has to be followed by
     * the announced number of continuation bytes (10bbbbbb). Lead bytes for
     * sequences of up to six bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the string to check
     * @return bool
     */
    function utf8_check($Str) {
        // bit mask => expected lead byte pattern, ordered by number of continuation bytes (1..5)
        $leads = array(
            0xE0 => 0xC0, # 110bbbbb
            0xF0 => 0xE0, # 1110bbbb
            0xF8 => 0xF0, # 11110bbb
            0xFC => 0xF8, # 111110bb
            0xFE => 0xFC, # 1111110b
        );

        $total = strlen($Str);
        $pos = 0;
        while ($pos < $total) {
            $byte = ord($Str[$pos]);

            if ($byte >= 0x80) {
                // find out how many continuation bytes this lead byte announces
                $expected = 0;
                $count = 0;
                foreach ($leads as $mask => $pattern) {
                    $count++;
                    if (($byte & $mask) == $pattern) {
                        $expected = $count;
                        break;
                    }
                }
                if ($expected == 0) return false; # Does not match any model

                // all announced bytes have to be present and look like 10bbbbbb
                while ($expected-- > 0) {
                    $pos++;
                    if ($pos == $total) return false;
                    if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
                }
            }

            $pos++;
        }
        return true;
    }
}
99
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim($path,'\\/');
        // position of the last separator; after the trim above it can never be 0
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if($rpos) $path = substr($path, $rpos+1);

        $suffix = (string) $suffix;
        $suflen = strlen($suffix);
        // strict comparison: with == numeric looking strings such as '100' and
        // '1e2' would be compared by value and count as the same suffix
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
125
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * The string is run through utf8_decode() first. Characters that do not
     * exist in ISO-8859-1 come out as '?', which is alright as long as the
     * result is only used for counting.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string
     * @return int    length of the decoded string
     */
    function utf8_strlen($string){
        $decoded = utf8_decode($string);
        return strlen($decoded);
    }
}
142
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Returns the part of a string selected by a character offset and an
     * optional character length. mb_substr() does the work when UTF8_MBSTRING
     * is set, otherwise a PCRE pattern with the 'u' modifier is built.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * Fallback without mbstring:
         *
         * - pcre only accepts repetition counts below 65536, larger offsets and
         *   lengths are therefore expressed as a repeated group of 65535
         *   characters plus a remainder
         * - like mb_substr() an empty string is returned where substr() might
         *   return false (e.g. offset beyond the end of the string)
         * - counting the characters of the string is expensive, so it is only
         *   done when a negative offset or a length makes it necessary
         */

        // bring the parameters into the expected types once
        $str       = (string)$str;
        $offset    = (int)$offset;
        $hasLength = !is_null($length);
        if ($hasLength) $length = (int)$length;

        // cases that are empty regardless of the string's content
        if ($hasLength && $length === 0) return '';
        if ($hasLength && $offset < 0 && $length < 0 && $length < $offset) return '';

        $charCount = null;                            // number of characters, computed lazily

        // turn a negative offset into one counted from the left
        // (a tail anchored pattern would work as well, but is horribly slow)
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = max(0, $charCount + $offset);
        }

        // part of the pattern that skips $offset characters without capturing them
        if ($offset > 0) {
            $blocks = (int)($offset/65535);
            $rest   = $offset%65535;
            $skip   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
            $head   = '^(?:'.$skip.'.{'.$rest.'})';
        } else {
            $head = '^';                              // nothing to skip, just anchor
        }

        // part of the pattern that captures the wanted characters
        $body = '';
        if (!$hasLength) {
            $body = '(.*)$';                          // everything up to the end
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';      // starts behind the end

            if ($length > 0) {
                // never ask for more characters than are left
                $length = min($charCount-$offset, $length);

                $blocks = (int)($length/65535);
                $rest   = $length%65535;
                $take   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';

                // capture exactly $length characters
                $body = '('.$take.'.{'.$rest.'})';

            } else if ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $blocks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;
                $drop   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';

                // capture everything but the last -$length characters
                $body = '(.*)(?:'.$drop.'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$head.$body.'#us',$str,$match)) return '';
        return $match[1];
    }
}
246
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Offsets and lengths are counted in characters. Negative values are
     * interpreted like substr_replace() does: a negative $start counts from
     * the end of the string, a negative $length gives the number of characters
     * at the end of the string that are kept.
     *
     * Note that $length defaults to 0 (pure insertion), not to the length of
     * the string as in substr_replace().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      the string to work on
     * @param  string $replacement the text to put in
     * @param  int    $start       character offset where the replacement starts
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // the character count is only needed to resolve negative arguments
        if($start < 0 || $length < 0){
            $strlen = utf8_strlen($string);
            if($start < 0) $start = max(0, $strlen + $start);
            if($length < 0) $length = max(0, $strlen - $start + $length);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
262
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. With a charlist every
     * character in it is taken literally, ranges ("a..z") are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; preg_quote() also covers
        // '^', which would otherwise negate (or break) the class when it comes first
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
282
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. With a charlist every
     * character in it is taken literally, ranges ("a..z") are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; preg_quote() also covers
        // '^', which would otherwise negate (or break) the class when it comes first
        $charlist = preg_quote($charlist, '/');

        // D modifier: '$' must only match at the very end, not before a final newline
        return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
302
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Falls back to the native trim() when no charlist is given, otherwise the
     * right and then the left end are trimmed with the UTF-8 aware functions.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $tailTrimmed = utf8_rtrim($str,$charlist);
            return utf8_ltrim($tailTrimmed,$charlist);
        }

        return trim($str);
    }
}
319
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * mb_strtolower() is used when UTF8_MBSTRING is set, otherwise the string
     * is translated with the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
337
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * mb_strtoupper() is used when UTF8_MBSTRING is set, otherwise the string
     * is translated with the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
355
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if ($chars == 0) return '';
        if ($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];
        return utf8_strtoupper($first).$rest;
    }
}
377
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        //
        // group 1/2: whitespace in front of the word (empty at the start of the string)
        // group 3:   first character of the word
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // The match is made of whitespace + first character + rest, so the rest
        // is cut off by byte length. ltrim() must not be used for this: its
        // default character set differs from the pattern's whitespace set (it
        // keeps form feeds and removes NUL bytes).
        $rest = (string) substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
414
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The optional parameter restricts the replacement to lower case letters
     * ($case = -1) or upper case letters ($case = 1). By default ($case = 0)
     * both tables are applied, lower case first.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @param  int    $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
436
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is handed back untouched, everything else is
     * translated with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string,$UTF8_ROMANIZATION);
        }

        return $string;
    }
}
450
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters from the global $UTF8_SPECIAL_CHARS2 the bytes
     * 0x00 to 0x19 are stripped as well. The quoted list of specials is built
     * on the first call and reused afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class as it is
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}
476
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done byte-wise with strpos(); each byte position
     * found is converted into a character position. As long as that position
     * lies in front of the requested character offset, the search is repeated
     * further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false     character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // extra bytes added to the offset to make up for multibyte characters
        $shift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) {
                return false;
            }

            // number of characters in front of the hit
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                $shift = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
507
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are kept, code points below 0x100 become decimal
     * entities and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $pieces[] = '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $pieces[] = "&#$cp;";
            } else {
                $pieces[] = chr($cp);
            }
        }

        return implode('', $pieces);
    }
}
529
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (&#..; and &#x..;) are converted.
     * Pass HTML_ENTITIES as second parameter and named entities, including
     * &amp; &lt; etc., are handled in the same pass. Doing it in one pass
     * avoids the problem that would occur if you had to decode
     * "&amp;#38;&#38;amp;#38;" in two steps:
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder is created on the first call and shared by all later ones
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
561
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Used as a preg_replace_callback() callback: index 2 of the match holds
     * the 'x'/'X' marker of hexadecimal entities, index 3 the digits.
     *
     * @param  array $ent match of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');
        $cp    = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
582
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps a named entity (e.g. '&eacute;') to its UTF-8 character */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly, so its
         * characters can be used as they are. Converting them with ord()
         * only works for single byte tables and would reduce every multibyte
         * character to its first byte.
         */
        public function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Takes the first byte of the given string as code point. No longer
         * used for building the table, kept for backward compatibility.
         *
         * @param $c string
         * @return mixed
         */
        public function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Numeric entities are passed on to utf8_decode_numeric(), named ones
         * are looked up in the table. Unknown entities are returned unchanged.
         *
         * @param $ent array match of an entity as found by utf8_unhtml()
         * @return string
         */
        public function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
626
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. This covers bad lead and continuation bytes,
     * non-shortest forms, surrogates and code points above 0x10FFFF.
     *
     * Without $strict bad bytes are skipped silently and code points from
     * illegal sequences are still added to the output. A sequence that is cut
     * off by the end of the string is dropped without an error in both modes.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax is a parse error since PHP 8.0
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its six payload bits at the right position
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
800
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict such values are skipped.
     *
     * Anything that is not an array results in an empty string.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        // The result is collected in a plain string. The output buffer used
        // previously was left open (including the partial result) whenever
        // strict mode bailed out early.
        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # out of range, nothing sensible can be encoded

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
893
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * When UTF8_MBSTRING is set the work is delegated to mb_convert_encoding().
     * Otherwise the string is decoded with utf8_to_unicode() and every code
     * point is written as big-endian 16-bit units; code points above U+FFFF
     * are written as a surrogate pair instead of being truncated to 16 bits.
     *
     * NOTE(review): utf8_to_unicode() is defined elsewhere in this file; the
     * original comment warned that it may be limited to UCS-2 - confirm.
     *
     * @param string $str UTF-8 string (passed by reference; not assigned to here)
     * @param bool   $bom prepend the UTF-16BE byte order mark (0xFE 0xFF)
     * @return string UTF-16BE encoded bytes
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // supplementary plane: split the 20 bit offset into a
                // high (0xD800..) and a low (0xDC00..) surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x03FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
911
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big-endian 16-bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) is
     * combined into the code point it encodes; all other units are passed on
     * unchanged. The resulting code points are encoded by unicode_to_utf8().
     *
     * @param string $str UTF-16BE encoded bytes (passed by reference; not assigned to here)
     * @return string whatever unicode_to_utf8() returns for the decoded code points
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        // unpack() reports failure with false; unicode_to_utf8() answers ''
        // for any non-array, so hand the value through untouched
        if(!is_array($units)) return unicode_to_utf8($units);

        $units = array_values($units); // unpack() keys start at 1
        $count = count($units);
        $uni   = array();
        for($i=0; $i<$count; $i++){
            $cp = $units[$i];
            if($cp >= 0xD800 && $cp <= 0xDBFF && $i+1 < $count
               && $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
                // valid surrogate pair: rebuild the supplementary code point
                $cp = 0x10000 + (($cp - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
                $i++; // the low surrogate has been consumed
            }
            $uni[] = $cp;
        }
        return unicode_to_utf8($uni);
    }
}
923
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned from left to right. Every well-formed UTF-8
     * sequence is copied to the result; every byte that is not part of one
     * is swapped for $replace.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte
        $pattern = '/'.$UTF8_BAD.'/S';

        $result = '';
        $offset = 0;
        $length = strlen($str);

        // Walk the string with a moving offset instead of re-copying the
        // remainder with substr() after every match.
        while ($offset < $length
               && preg_match($pattern, $str, $matches, PREG_OFFSET_CAPTURE, $offset)) {
            // group 2 only takes part in the match when the "invalid byte"
            // alternative was used
            if ( !isset($matches[2])) {
                $result .= $matches[0][0];
            } else {
                $result .= $replace;
            }
            // continue right behind the match (start position + its length)
            $offset = $matches[0][1] + strlen($matches[0][0]);
        }
        return $result;
    }
}
965
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside a UTF-8 string onto a character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the byte length
     * yields the byte length. In between, bytes of the form 10xxxxxx
     * (UTF-8 continuation bytes) are stepped over until a boundary is hit.
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction in which to look for the boundary,
     *                           false = backwards (start of the current character)
     *                           true  = forwards (start of the next character)
     *
     * @return int            byte index into $str pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i>=$limit) return $limit;

        if ($next) {
            // walk forward past continuation bytes, at most to the end
            for (; $i<$limit; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
            return $i;
        }

        // walk backward to the lead byte, at most to the start
        for (; $i; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
996
997// only needed if no mb_string available
998if(!UTF8_MBSTRING){
999    /**
1000     * UTF-8 Case lookup table
1001     *
1002     * This lookup table maps lower case letters to their corresponding
1003     * upper case letter in UTF-8
1004     *
1005     * @author Andreas Gohr <andi@splitbrain.org>
1006     */
1007    global $UTF8_LOWER_TO_UPPER;
1008    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1009            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
1010            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
1011            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1012            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1013            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1014            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1015            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1016            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1017            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1018            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1019            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1020            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1021            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1022            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1023            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1024            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1025            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1026            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1027            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1028            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1029            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1030            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1031            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1032            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1033            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1034            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1035            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1036            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1037            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1038            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1039            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1040            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1041            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1042            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1043            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1044            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1045            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1046            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1047            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1048            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1049            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1050            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1051            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1052            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1053            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1054            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1055            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1056            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1057            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1058            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1059            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1060            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1061            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1062            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1063            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1064            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1065            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1066            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1067            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1068            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1069            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1070            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1071            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1072            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1073            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1074            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1075            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1076                );
1077
1078    /**
1079     * UTF-8 Case lookup table
1080     *
1081     * This lookup table maps upper case letters to their corresponding
1082     * lower case letter in UTF-8
1083     *
1084     * @author Andreas Gohr <andi@splitbrain.org>
1085     */
1086    global $UTF8_UPPER_TO_LOWER;
1087    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1088            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1089            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1090            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1091            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1092            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1093            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1094            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1095            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1096            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1097            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1098            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1099            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1100            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1101            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1102            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1103            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1104            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1105            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1106            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1107            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1108            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1109            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1110            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1111            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1112            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1113            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1114            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1115            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1116            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1117            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1118            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1119            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1120            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1121            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1122            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1123            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1124            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1125            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1126            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1127            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1128            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1129            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1130            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1131            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1132            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1133            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1134            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1135            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1136            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1137            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1138            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1139            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1140            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1141            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1142            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1143            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1144            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1145            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1146            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1147            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1148            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1149            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1150            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1151            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1152            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1153            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1154            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1155                );
1156}; // end of case lookup tables
1157
1158/**
1159 * UTF-8 lookup table for lower case accented letters
1160 *
1161 * This lookup table defines replacements from the ASCII-7 range for accented
1162 * characters. These are lower case letters only.
1163 *
1164 * @author Andreas Gohr <andi@splitbrain.org>
1165 * @see    utf8_deaccent()
1166 */
1167global $UTF8_LOWER_ACCENTS;
1168if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1169  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1170  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1171  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1172  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1173  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1174  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1175  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1176  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1177  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1178  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1179  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1180  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1181  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1182  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1183  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1184);
1185
1186/**
1187 * UTF-8 lookup table for upper case accented letters
1188 *
1189 * This lookup table defines replacements from the ASCII-7 range for accented
1190 * characters. These are upper case letters only.
1191 *
1192 * @author Andreas Gohr <andi@splitbrain.org>
1193 * @see    utf8_deaccent()
1194 */
1195global $UTF8_UPPER_ACCENTS;
1196if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1197  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1198  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1199  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1200  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1201  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1202  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1203  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1204  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1205  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1206  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1207  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1208  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1209  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1210  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1211  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1212);
1213
1214/**
1215 * UTF-8 array of common special characters
1216 *
1217 * This array should contain all special characters (not a letter or digit)
1218 * defined in the various local charsets - it's not a complete list of non-alphanum
1219 * characters in UTF-8. It's not perfect but should match most cases of special
1220 * chars.
1221 *
1222 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1223 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1224 *
1225 * @author Andreas Gohr <andi@splitbrain.org>
1226 * @see    utf8_stripspecials()
1227 */
1228global $UTF8_SPECIAL_CHARS;
1229if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1230  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1231  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1232          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1233  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1234  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1235  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1236  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1237  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1238  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1239  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1240  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1241  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1242  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1243  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1244  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1245  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1246  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1247  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1248  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1249  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1250  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1251  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1252  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1253  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1254  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1255  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1256  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1257  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1258  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1259  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1260  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1261  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1262  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1263  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1264  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1265  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1266  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1267  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1268  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1269  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1270  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1271  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1272  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1273  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1274  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1275  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1276  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1277  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1278  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1279  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1280  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1281  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1282  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1283          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1284  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1285  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1286  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1287  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1288  0xffeb, 0xffec, 0xffed, 0xffee,
1289  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1290  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1291  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
1292  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
1293  0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1294);
1295
1296// utf8 version of above data
1297global $UTF8_SPECIAL_CHARS2;
1298if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1299    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1300    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1301    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1302    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1303    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1304    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1305    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1306    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1307    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1308    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1309    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1310    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1311    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1312    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1313    '➷➸➹➺➻➼➽➾'.
1314    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1315    '�'.
1316    '�ﹼﹽ'.
1317    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1318    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1319    '����������������������������������������������������������������'.
1320    '   ⁠';
1321
1322/**
1323 * Romanization lookup table
1324 *
1325 * This lookup table provides a way to transform strings written in a script
1326 * that is not based on latin letters into plain ASCII.
1327 *
1328 * Please note: this is not a scientific transliteration table. It only works
1329 * one way, from non-latin to ASCII, and it works by simple character replacement
1330 * only. Peculiarities of individual languages are not supported.
1331 *
1332 * @author Andreas Gohr <andi@splitbrain.org>
1333 * @author Vitaly Blokhin <vitinfo@vitn.com>
1334 * @link   http://www.uconv.com/translit.htm
1335 * @author Bisqwit <bisqwit@iki.fi>
1336 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1337 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1338 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1339 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1340 * @author Arthit Suriyawongkul <arthit@gmail.com>
1341 * @author Denis Scheither <amorphis@uni-bremen.de>
1342 * @author Eivind Morland <eivind.morland@gmail.com>
1343 */
global $UTF8_ROMANIZATION;
// The table is only built when no (non-empty) table has been defined before,
// so a previously defined $UTF8_ROMANIZATION is kept as is.
//
// Maintenance note: in a PHP array literal a repeated key silently overrides
// the earlier value ("last one wins"). Keep every key unique with respect to
// its value, otherwise the entry that looks active may in fact be dead.
if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
  // scandinavian - differs from what we do in deaccent
  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',

  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian ('კ' is k, not p - 'პ' is the letter for p)
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Sanskrit diacritics
  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese characters  (last update: 2008-05-09)

  // Japanese hiragana

  // 3 character syllables, っ doubles the consonant after
  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',

  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',

   // 2 character syllables - normal
  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
  'うぇ'=>'we','うぃ'=>'wi',
  'いぇ'=>'ye',

  // 2 character syllables, っ doubles the consonant after
  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',

  // 1 character syllables
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
  'わ'=>'wa','を'=>'wo',
  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
  // old characters
  'ゑ'=>'we','ゐ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',

  // never seen one of those (disabled for the moment)
  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',

  // 'spare' characters from other romanization systems
  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
  //'し'=>'ci','い'=>'yi','ぢ'=>'dzi',
  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',


  // Japanese katakana

  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
  'ッティー'=>'ttii',
  'ッヂィー'=>'ddii',

  // 3 character syllables - doubled vowels
  // (フェー/フィー are romanized as fee/fii, in line with the 2 character フェ/フィ below)
  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
  'フャー'=>'fyaa','フョー'=>'fyoo','フュー'=>'fyuu',
  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
  // ヂィー is defined once, further down in this section (as 'dii')
  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
  // ウェー/ウィー and ヴェー/ヴィー are defined once each, a few lines below
  'ウァー'=>'whaa','ウォー'=>'whoo','ウゥー'=>'whuu',
  'ヴャー'=>'vyaa','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
  'ウェー'=>'wee','ウィー'=>'wii',
  'イェー'=>'yee',
  'ティー'=>'tii',
  'ヂィー'=>'dii',

  // 3 character syllables - doubled consonants
  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
  'ッティ'=>'tti',
  'ッヂィ'=>'ddi',

  // 3 character syllables - doubled vowel and consonants
  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
  'ッマー'=>'mmaa','ッメー'=>'mmee','ッミー'=>'mmii','ッモー'=>'mmoo','ッムー'=>'mmuu',
  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'cchii','ットー'=>'ttoo','ッツー'=>'ttsuu',
  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',

  // 2 character syllables - normal
  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
  // ウェ/ウィ and ヴェ/ヴィ are defined once each, a few lines below
  'ウァ'=>'wha','ウォ'=>'who','ウゥ'=>'whu',
  'ヴャ'=>'vya','ヴョ'=>'vyo','ヴュ'=>'vyu',
  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
  'ウェ'=>'we','ウィ'=>'wi',
  'イェ'=>'ye',
  'ティ'=>'ti',
  'ヂィ'=>'di',

  // 2 character syllables - doubled vowel
  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
  'ワー'=>'waa','ヲー'=>'woo',
  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
  'ヵー'=>'kaa','ヶー'=>'kee',
  // old characters
  'ヱー'=>'wee','ヰー'=>'wii',

  // separate katakana 'n'
  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',

  // 2 character syllables - doubled consonants
  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
  'ッマ'=>'mma','ッメ'=>'mme','ッミ'=>'mmi','ッモ'=>'mmo','ッム'=>'mmu',
  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',

  // 1 character syllables
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
  'ワ'=>'wa','ヲ'=>'wo',
  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
  'ヵ'=>'ka','ヶ'=>'ke',
  // old characters
  'ヱ'=>'we','ヰ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',

  // special characters
  '・'=>'_','、'=>'_',
  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise

  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
  //'シ'=>'ci','フ'=>'hu','シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
  // sara i / sara ii => i, sara ue / sara uee => ue
  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ื'=>'ue','ุ'=>'u','ู'=>'u',
  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1701
1702
1703