xref: /dokuwiki/inc/utf8.php (revision 618a9504b8d0a3c2fc39602e6553d07e71dfdf20)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect mbstring support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() is available and UTF8_NOMBSTRING
 * has not been defined, 0 otherwise. A value defined earlier is kept.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

/**
 * Detect whether PREG was compiled with UTF-8 support
 *
 * Without this many of the functions below will not work, so this is a minimal requirement
 */
defined('UTF8_PREGSUPPORT') || define('UTF8_PREGSUPPORT', (bool) @preg_match('/^.$/u', 'ñ'));

/**
 * Detect whether PREG was compiled with Unicode property support
 *
 * Not required by the functions below, but might be needed in a UTF-8 aware application
 */
defined('UTF8_PROPERTYSUPPORT') || define('UTF8_PROPERTYSUPPORT', (bool) @preg_match('/^\pL$/u', 'ñ'));

// let the mbstring functions work on UTF-8 whenever they are in use
if(UTF8_MBSTRING) mb_internal_encoding('UTF-8');
40
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str the string to inspect
     * @return bool false if at least one byte outside of 0x00-0x7F was found, true otherwise
     */
    function utf8_isASCII($str){
        $hasHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);
        return !$hasHighByte;
    }
}
54
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is dropped, so the result is a
     * pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string the input reduced to its 7bit ASCII bytes
     */
    function utf8_strip($str){
        $str = (string) $str;
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square bracket offsets: the curly brace form ($str{$i}) is a
            // parse error since PHP 8
            if(ord($str[$i]) < 128){
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}
77
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * The string is walked byte by byte: a lead byte announces how many
     * continuation bytes (10bbbbbb) have to follow it. Lead bytes for up to
     * five continuation bytes are accepted; code point ranges are not checked.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $Str
     * @return bool false on an unknown lead byte or a missing/bad continuation byte
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead >= 0x80) {
                // number of continuation bytes announced by the lead byte
                if     (($lead & 0xE0) == 0xC0) $follow = 1; # 110bbbbb
                elseif (($lead & 0xF0) == 0xE0) $follow = 2; # 1110bbbb
                elseif (($lead & 0xF8) == 0xF0) $follow = 3; # 11110bbb
                elseif (($lead & 0xFC) == 0xF8) $follow = 4; # 111110bb
                elseif (($lead & 0xFE) == 0xFC) $follow = 5; # 1111110b
                else return false;                           # matches no model

                for (; $follow > 0; $follow--) {
                    $pos++;
                    if ($pos == $total) return false;                       # string ended too early
                    if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;     # not 10bbbbbb
                }
            }

            $pos++;
        }
        return true;
    }
}
108
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Separators at the
     * start and end of the path are removed before the last component is taken.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $suffix = (string) $suffix;

        $path = trim($path,'\\/');
        // strrpos() gives false for a missing separator, max() then picks the other one
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if($rpos) $path = substr($path, $rpos+1);

        $suflen = strlen($suffix);
        // strict comparison: with == two numeric looking strings such as
        // '1.0' and '001' would be compared as numbers and wrongly match
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
135
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * The first available strategy is used, in this order:
     * strlen(utf8_decode()), mb_strlen(), iconv_strlen() and finally plain
     * strlen(). utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     *
     * @param string $string
     * @return int
     */
    function utf8_strlen($string) {
        if (function_exists('utf8_decode')) return strlen(utf8_decode($string));

        if (UTF8_MBSTRING) return mb_strlen($string, 'UTF-8');

        if (function_exists('iconv_strlen')) return iconv_strlen($string, 'UTF-8');

        // nothing better available: this counts bytes
        return strlen($string);
    }
}
163
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * With mbstring available the work is handed to mb_substr(), otherwise a
     * regular expression with the 'u' flag is built that skips $offset
     * characters and captures the requested part.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string the requested part, an empty string if nothing matched
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * No mbstring from here on.
         *
         * - pcre only supports repetitions of less than 65536, so bigger
         *   counts are expressed as N groups of 65535 characters plus a rest
         * - substr() may return false (e.g. offset > string length), this
         *   function returns an empty string in those cases like mb_substr()
         * - counting the characters is relatively expensive, so it is only
         *   done when needed (negative offset or a given length)
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        $hasLength = !is_null($length);
        if ($hasLength) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // number of characters in $str, null until it has been counted
        $chars = null;

        // turn a negative offset into the equivalent offset from the left
        // (a tail anchored pattern would work too, but is horribly slow)
        if ($offset < 0) {
            $chars = utf8_strlen($str);
            $offset = max(0, $chars + $offset);
        }

        // pattern part skipping $offset characters without capturing them
        $head = '^';
        if ($offset > 0) {
            $blocks = (int)($offset/65535);
            $rest   = $offset%65535;
            $head   = '^(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';
        }

        // pattern part capturing the wanted characters
        if (!$hasLength) {
            $tail = '(.*)$';                            // the rest of the string
        } else {
            if ($chars === null) $chars = utf8_strlen($str);
            if ($offset > $chars) return '';            // offset is behind the end

            if ($length > 0) {
                // never ask for more than what is left behind the offset
                $length = min($chars-$offset, $length);

                $blocks = (int)($length/65535);
                $rest   = $length%65535;

                // capture exactly $length characters
                $tail = '('.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';
            } else {
                // $length is negative here, zero was handled above
                if ($length < ($offset - $chars)) return '';

                $blocks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;

                // capture everything but the last -$length characters
                $tail = '(.*)(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$head.$tail.'#us',$str,$match)) return '';
        return $match[1];
    }
}
268
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Note that, unlike substr_replace(), $length defaults to 0 here, so by
     * default the replacement is inserted and nothing is removed.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     *
     * @param string $string      input string
     * @param string $replacement the replacement
     * @param int    $start       the replacing will begin at the start'th offset into string.
     *                            A negative value counts from the end of string.
     * @param int    $length      If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If negative, it is the number of characters from the end of string
     *                            at which to stop replacing. If length is zero then this function will have the
     *                            effect of inserting replacement into string at the given start offset.
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // translate negative arguments into plain offsets from the left,
        // otherwise the part in front of a negative $start would be lost
        if($start < 0 || $length < 0){
            $strlen = utf8_strlen($string);
            if($start < 0)  $start  = max(0, $strlen + $start);
            if($length < 0) $length = max(0, $strlen + $length - $start);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
292
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist this is plain ltrim(). With a charlist every leading
     * character contained in it is removed; the charlist is taken literally.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class. preg_quote() covers
        // \ - ] [ and the delimiter, and also ^ which would otherwise negate
        // the class when it is the first character of the charlist
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
313
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist this is plain rtrim(). With a charlist every trailing
     * character contained in it is removed; the charlist is taken literally.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class. preg_quote() covers
        // \ - ] [ and the delimiter, and also ^ which would otherwise negate
        // the class when it is the first character of the charlist
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
334
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist falls back to plain trim(), otherwise the string is
     * trimmed on the right and then on the left with the given charlist.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $str = utf8_rtrim($str,$charlist);
            return utf8_ltrim($str,$charlist);
        }

        return trim($str);
    }
}
352
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available; when the Normalizer class is
     * loaded as well, the lower cased string is passed through
     * Normalizer::normalize(). Without mb_string the $UTF8_UPPER_TO_LOWER
     * table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING) {
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        $lower = mb_strtolower($string,'utf-8');

        // do not trigger autoloading just to find out about the class
        if (class_exists("Normalizer", false)) {
            return Normalizer::normalize($lower);
        }
        return $lower;
    }
}
377
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, the $UTF8_LOWER_TO_UPPER table
     * otherwise
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
398
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if ($chars == 0) return '';
        if ($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything behind it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];
        return utf8_strtoupper($first).$rest;
    }
}
421
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     * @param string $str
     * @return string with first char of each word uppercase
     */
    function utf8_ucwords($str) {
        // A word is a run of characters that are none of form feed (\x0c),
        // horizontal tab (\x09), vertical tab (\x0b), linefeed (\x0a),
        // carriage return (\x0d) and space (\x20), see http://php.net/ucwords
        // Group 2 holds the whitespace in front of the word, group 3 the
        // word's first character.
        $wordPattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($wordPattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @see utf8_ucwords
     * @see utf8_strtoupper
     *
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     */
    function utf8_ucwords_callback($matches) {
        $upperFirst = utf8_strtoupper($matches[3]);
        $word = ltrim($matches[0]);

        // put the whitespace back in front of the capitalised word
        return $matches[2] . utf8_substr_replace($word,$upperFirst,0,1);
    }
}
460
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $withLower = ($case <= 0);
        $withUpper = ($case >= 0);

        if($withLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($withUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
486
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is translated
     * with the $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string,$UTF8_ROMANIZATION);
        }
        return $string;
    }
}
503
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the set of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS2)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted special chars are built once and reused by later calls
        static $specials = null;
        if($specials === null) $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');

        // $additional goes into the character class as is
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}
530
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched bytewise and the byte position found is turned
     * into a character position. A hit in front of the wanted character
     * offset makes the search start over further right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset
     * @return integer|false character position of the needle, false when not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // bytes to add to the character offset to get the byte position to search from
        $shift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) return false;

            // character position = number of characters in front of the hit
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            // hit is too far left: remember how many extra bytes the
            // multibyte characters in front of it take up
            if ($charPos < $offset) $shift = $bytePos - $charPos;
        } while ($charPos < $offset);

        return $charPos;
    }
}
562
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x100 become decimal entities, higher ones
     * hexadecimal entities. Code points below 0x80 are passed through as
     * plain characters unless $all is set.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @param bool $all Encode the characters below 0x80 as entities as well
     * @return string
     */
    function utf8_tohtml($str, $all = false) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $pieces[] = '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80 || $all) {
                $pieces[] = "&#$cp;";
            } else {
                $pieces[] = chr($cp);
            }
        }

        return implode('', $pieces);
    }
}
588
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES (any value other than null works) and named
     * entities, including &amp; &lt; etc. are handled as well. Avoids the
     * problem that would occur if you had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities === null) {
            // numeric entities only
            $pattern  = '/(&#([Xx])?([0-9A-Za-z]+);)/m';
            $callback = 'utf8_decode_numeric';
        } else {
            // numeric and named entities
            $pattern  = '/&(#)?([Xx])?([0-9A-Za-z]+);/m';
            $callback = array($decoder, 'decode');
        }

        return preg_replace_callback($pattern, $callback, $str);
    }
}
621
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array of one of the patterns used in utf8_unhtml():
     * index 2 holds the optional hex marker ('X' or 'x'), index 3 the digits.
     *
     * @param $ent array match array of a numeric entity
     * @return string|false
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');
        $cp = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
642
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array named entity (e.g. '&auml;') => UTF-8 encoded character */
        protected $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly, so its
         * characters can be used as they are. Feeding them through ord()
         * only works for single byte characters and breaks every non-ASCII
         * entity as soon as PHP hands out the table in UTF-8.
         */
        public function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single byte (ISO-8859-1) character to UTF-8. No longer
         * used by the constructor, kept for code calling it directly.
         *
         * @param string $c
         * @return string|false
         */
        public function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * @param array $ent match array of an entity: index 0 is the whole
         *                   entity, index 1 is '#' for numeric entities
         * @return string|false the decoded character, unknown named entities
         *                      are returned unchanged
         */
        public function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
686
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. Without $strict bad bytes are skipped silently
     * and code points failing the validity checks (non-shortest form,
     * surrogates, out of range) are still added to the output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     *
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return array|false array of unicode code points or false if UTF-8 invalid
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square bracket offset: the curly brace form ($str{$i}) is a
            // parse error since PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }
                // not strict: the bad octet is skipped

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its 6 payload bits at the
                    // position given by the number of octets still to come
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
861
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or > 0x10FFFF) and raises a PHP error at level
     * E_USER_WARNING. If $strict is false such values are silently skipped.
     *
     * Note: the result is built by plain string concatenation. No output
     * buffer is opened, so returning early in strict mode cannot leave a
     * dangling buffer or leak partial output.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ( ($cp < 0) || ($cp > 0x10ffff) ) {
                # outside the Unicode range (negative values must not reach
                # the multi byte branches, they would produce garbage bytes)

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );

                    // out of range
                    return false;
                }

                // not strict -- silently skip the value

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

                // not strict -- silently skip the surrogate

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff, upper bound checked above)

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
955
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is decoded with utf8_to_unicode() and every code point is packed
     * as a big endian 16 bit unit. Code points in the range 0x10000-0x10FFFF
     * are written as a surrogate pair instead of being truncated to 16 bit,
     * so both code paths produce real UTF-16 for astral characters.
     *
     * @param string $str UTF-8 encoded string (passed by reference)
     * @param bool $bom prepend the UTF-16BE byte order mark (0xFE 0xFF)?
     * @return string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp >= 0x10000 && $cp <= 0x10FFFF){
                // astral plane: split the 20 bit offset into high and low surrogate
                $offset = $cp - 0x10000;
                $out .= pack('nn', 0xD800 | ($offset >> 10), 0xDC00 | ($offset & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
977
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big endian 16 bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) is
     * combined into the astral code point it encodes; all other units are
     * handed to unicode_to_utf8() unchanged. A trailing odd byte is not part
     * of any 16 bit unit and is therefore not converted.
     *
     * @param string $str UTF-16BE encoded string (passed by reference)
     * @return false|string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        // unpack() returns a 1-based array, reindex for lookahead
        $units = array_values($units);
        $count = count($units);

        $uni = array();
        for($i = 0; $i < $count; $i++){
            $cp = $units[$i];

            if($cp >= 0xD800 && $cp <= 0xDBFF && ($i + 1) < $count &&
               $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
                // valid surrogate pair: rebuild the code point and consume both units
                $cp = 0x10000 + (($cp - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
                $i++;
            }

            $uni[] = $cp;
        }

        return unicode_to_utf8($uni);
    }
}
992
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is walked from left to right with an anchored match at a
     * moving byte offset. Each step consumes either one well formed UTF-8
     * sequence (copied to the result) or one invalid byte (replaced).
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        simply removes them) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        // \G anchors every match at the current offset, so the subject never
        // has to be copied with substr() while it is being consumed
        $pattern = '/\G'.$UTF8_BAD.'/S';

        $result = '';
        $offset = 0;
        while (preg_match($pattern, $str, $matches, 0, $offset)) {
            if ( !isset($matches[2])) {
                // well formed sequence, keep it
                $result .= $matches[0];
            } else {
                // second group only participates for an invalid byte
                $result .= $replace;
            }
            $offset += strlen($matches[0]);
        }
        return $result;
    }
}
1035
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index so that it points at the start of a UTF-8 character
     *
     * An index at or below 0 yields 0, an index at or beyond the string
     * length yields the string length. Otherwise the index is moved off any
     * continuation byte (10xxxxxx) in the requested direction.
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for the boundary,
     *                           false = backwards (start of the current character)
     *                           true = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) return 0;

        $length = strlen($str);
        if ($i >= $length) return $length;

        if ($next) {
            // walk forwards until a non-continuation byte or the end is reached
            for (; $i < $length; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
            return $i;
        }

        // walk backwards until a non-continuation byte or index 0 is reached
        for (; $i; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
1066
1067// only needed if no mb_string available
1068if(!UTF8_MBSTRING){
1069    /**
1070     * UTF-8 Case lookup table
1071     *
     * This lookup table maps each lower case letter to its corresponding
     * upper case letter in UTF-8
1074     *
1075     * @author Andreas Gohr <andi@splitbrain.org>
1076     */
1077    global $UTF8_LOWER_TO_UPPER;
1078    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1079            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
1080            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
1081            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1082            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1083            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1084            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1085            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1086            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1087            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1088            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1089            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1090            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1091            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1092            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1093            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1094            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1095            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1096            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1097            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1098            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1099            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1100            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1101            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1102            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1103            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1104            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1105            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1106            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1107            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1108            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1109            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1110            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1111            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1112            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1113            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1114            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1115            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1116            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1117            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1118            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1119            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1120            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1121            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1122            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1123            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1124            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1125            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1126            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1127            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1128            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1129            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1130            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1131            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1132            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1133            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1134            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1135            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1136            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1137            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1138            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1139            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1140            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1141            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1142            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1143            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1144            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1145            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1146                );
1147
1148    /**
1149     * UTF-8 Case lookup table
1150     *
     * This lookup table maps each upper case letter to its corresponding
     * lower case letter in UTF-8
1153     *
1154     * @author Andreas Gohr <andi@splitbrain.org>
1155     */
1156    global $UTF8_UPPER_TO_LOWER;
1157    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1158            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1159            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1160            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1161            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1162            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1163            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1164            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1165            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1166            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1167            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1168            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1169            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1170            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1171            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1172            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1173            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1174            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1175            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1176            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1177            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1178            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1179            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1180            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1181            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1182            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1183            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1184            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1185            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1186            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1187            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1188            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1189            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1190            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1191            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1192            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1193            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1194            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1195            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1196            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1197            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1198            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1199            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1200            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1201            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1202            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1203            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1204            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1205            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1206            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1207            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1208            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1209            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1210            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1211            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1212            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1213            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1214            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1215            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1216            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1217            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1218            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1219            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1220            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1221            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1222            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1223            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1224            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1225                );
1226}; // end of case lookup tables
1227
1228/**
1229 * UTF-8 lookup table for lower case accented letters
1230 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are lower case letters only.
1233 *
1234 * @author Andreas Gohr <andi@splitbrain.org>
1235 * @see    utf8_deaccent()
1236 */
1237global $UTF8_LOWER_ACCENTS;
1238if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1239  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1240  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1241  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1242  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1243  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1244  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1245  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1246  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1247  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1248  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1249  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1250  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1251  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1252  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1253  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1254);
1255
1256/**
1257 * UTF-8 lookup table for upper case accented letters
1258 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are upper case letters only.
1261 *
1262 * @author Andreas Gohr <andi@splitbrain.org>
1263 * @see    utf8_deaccent()
1264 */
1265global $UTF8_UPPER_ACCENTS;
1266if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1267  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1268  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1269  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1270  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1271  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1272  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1273  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1274  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1275  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1276  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1277  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1278  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1279  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1280  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1281  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1282);
1283
1284/**
1285 * UTF-8 array of common special characters
1286 *
1287 * This array should contain all special characters (not a letter or digit)
1288 * defined in the various local charsets - it's not a complete list of non-alphanum
1289 * characters in UTF-8. It's not perfect but should match most cases of special
1290 * chars.
1291 *
1292 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1293 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1294 *
1295 * @author Andreas Gohr <andi@splitbrain.org>
1296 * @see    utf8_stripspecials()
1297 */
1298global $UTF8_SPECIAL_CHARS;
1299if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1300  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1301  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1302          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1303  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1304  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1305  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1306  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1307  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1308  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1309  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1310  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1311  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1312  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1313  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1314  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1315  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1316  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1317  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1318  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1319  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1320  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1321  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1322  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1323  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1324  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1325  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1326  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1327  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1328  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1329  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1330  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1331  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1332  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1333  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1334  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1335  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1336  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1337  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1338  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1339  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1340  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1341  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1342  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1343  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1344  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1345  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1346  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1347  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1348  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1349  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1350  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1351  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1352  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1353          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1354  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1355  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1356  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1357  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1358  0xffeb, 0xffec, 0xffed, 0xffee,
1359  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1360  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1361  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
1362  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
1363  0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1364);
1365
1366// utf8 version of above data
// String form of the special character set: per the note above, it is meant to
// carry the same characters as the code point table that ends just before this
// statement, already encoded as UTF-8. It is assigned only when
// $UTF8_SPECIAL_CHARS2 is still empty, so a value set earlier is kept.
// NOTE(review): several literals below contain U+FFFD replacement characters,
// and the ASCII punctuation literal near the end contains an unescaped single
// quote. This looks like re-encoding/extraction damage rather than intent -
// confirm against a pristine copy of the file before relying on this string.
1367global $UTF8_SPECIAL_CHARS2;
1368if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1369    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1370    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1371    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1372    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1373    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1374    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1375    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1376    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1377    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1378    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1379    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1380    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1381    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1382    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1383    '➷➸➹➺➻➼➽➾'.
1384    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1385    '�'.
1386    '�ﹼﹽ'.
1387    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1388    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1389    '����������������������������������������������������������������'.
1390    '   ⁠';
1391
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character replacement
 * only. Specialities of each language are not supported.
 *
 * Maintenance notes:
 *  - Keys are UTF-8 source sequences (one or more characters), values are the
 *    ASCII replacement; an empty string drops the sequence.
 *  - PHP keeps only the LAST value when a key is repeated inside one array
 *    literal. Some katakana rows below use this on purpose (a later, simpler
 *    spelling such as 'we'/'ve' overrides an earlier 'whe'/'vye'), so check
 *    for an existing key before adding an entry.
 *  - The table is only assigned while $UTF8_ROMANIZATION is still empty, so a
 *    table defined before this file is loaded is left untouched.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 * @author Arthit Suriyawongkul <arthit@gmail.com>
 * @author Denis Scheither <amorphis@uni-bremen.de>
 * @author Eivind Morland <eivind.morland@gmail.com>
 */
global $UTF8_ROMANIZATION;
if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
  // scandinavian - differs from what we do in deaccent
  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',

  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Sanskrit diacritics
  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese characters  (last update: 2008-05-09)

  // Japanese hiragana

  // 3 character syllables, っ doubles the consonant after
  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',

  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',

   // 2 character syllables - normal
  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
  'うぇ'=>'we','うぃ'=>'wi',
  'いぇ'=>'ye',

  // 2 character syllables, っ doubles the consonant after
  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',

  // 1 character syllables
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
  'わ'=>'wa','を'=>'wo',
  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
  // old characters
  'ゑ'=>'we','ゐ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',

  // never seen one of those (disabled for the moment)
  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',

  // 'spare' characters from other romanization systems
  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',


  // Japanese katakana

  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
  'ッティー'=>'ttii',
  'ッヂィー'=>'ddii',

  // 3 character syllables - doubled vowels
  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
  // 'フェー' and 'フィー' are deliberately not repeated in the next row: a repeated
  // key would replace the 'fee'/'fii' spellings given in the row above
  'フャー'=>'fyaa','フョー'=>'fyoo','フュー'=>'fyuu',
  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
  'ウェー'=>'wee','ウィー'=>'wii',
  'イェー'=>'yee',
  'ティー'=>'tii',
  'ヂィー'=>'dii',

  // 3 character syllables - doubled consonants
  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
  'ッティ'=>'tti',
  'ッヂィ'=>'ddi',

  // 3 character syllables - doubled vowel and consonants
  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
  'ッマー'=>'mmaa','ッメー'=>'mmee','ッミー'=>'mmii','ッモー'=>'mmoo','ッムー'=>'mmuu',
  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
  // 'ッチー' must keep the doubled consonant here: this row repeats the key from
  // the 4 character section and the last occurrence is the one PHP keeps
  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'cchii','ットー'=>'ttoo','ッツー'=>'ttsuu',
  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',

  // 2 character syllables - normal
  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
  'ウェ'=>'we','ウィ'=>'wi',
  'イェ'=>'ye',
  'ティ'=>'ti',
  'ヂィ'=>'di',

  // 2 character syllables - doubled vowel
  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
  'ワー'=>'waa','ヲー'=>'woo',
  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
  'ヵー'=>'kaa','ヶー'=>'kee',
  // old characters
  'ヱー'=>'wee','ヰー'=>'wii',

  // separate katakana 'n'
  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',

  // 2 character syllables - doubled consonants
  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
  'ッマ'=>'mma','ッメ'=>'mme','ッミ'=>'mmi','ッモ'=>'mmo','ッム'=>'mmu',
  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',

  // 1 character syllables
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
  'ワ'=>'wa','ヲ'=>'wo',
  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
  'ヵ'=>'ka','ヶ'=>'ke',
  // old characters
  'ヱ'=>'we','ヰ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',

  // special characters
  '・'=>'_','、'=>'_',
  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise

  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
  // sara i, sara ii, sara ue, sara uee (U+0E34..U+0E37) - four distinct keys
  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ื'=>'ue','ุ'=>'u','ู'=>'u',
  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1771
1772
1773