1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9/**
10 * check for mb_string support
11 */
if(!defined('UTF8_MBSTRING')){
    // mbstring is used when the extension is loaded, unless the application
    // opted out by defining UTF8_NOMBSTRING before including this file
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

/**
 * Probe whether PCRE understands the /u (UTF-8) modifier
 *
 * Many functions in this file rely on it, so this is a minimal requirement.
 * The probe is silenced because an unsupported modifier raises a warning.
 */
if(!defined('UTF8_PREGSUPPORT')){
    define('UTF8_PREGSUPPORT', @preg_match('/^.$/u', 'ñ') ? true : false);
}

/**
 * Probe whether PCRE supports Unicode properties such as \pL
 *
 * The functions in this file do not need it, but a UTF-8 aware application
 * might want to know.
 */
if(!defined('UTF8_PROPERTYSUPPORT')){
    define('UTF8_PROPERTYSUPPORT', @preg_match('/^\pL$/u', 'ñ') ? true : false);
}


// when mbstring is in use, make UTF-8 its internal encoding
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
40
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string consists of 7bit ASCII bytes only
     *
     * The string is searched for a single byte outside 0x00-0x7F. An empty
     * string counts as ASCII.
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str the string to inspect
     * @return bool false if a byte above 0x7F was found, true otherwise
     */
    function utf8_isASCII($str){
        $highByteFound = preg_match('/(?:[^\x00-\x7F])/', $str);
        if($highByteFound === 1){
            return false;
        }
        return true;
    }
}
54
if(!function_exists('utf8_strip')){
    /**
     * Drops every byte with the high bit set
     *
     * What remains is a pure 7bit ASCII string; multibyte characters are
     * removed completely, they are not transliterated.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str the string to filter
     * @return string the ASCII-only bytes of $str in their original order
     */
    function utf8_strip($str){
        $kept  = '';
        $total = strlen($str);
        $pos   = 0;
        while($pos < $total){
            $byte = $str[$pos];
            $pos++;
            if(ord($byte) >= 128){
                continue; // part of a multibyte sequence
            }
            $kept .= $byte;
        }
        return $kept;
    }
}
77
if(!function_exists('utf8_check')){
    /**
     * Tries to detect whether a string is structurally UTF-8
     *
     * Only the byte layout is verified: every lead byte has to be followed by
     * the number of 10bbbbbb continuation bytes it announces. Lead bytes for
     * 5 and 6 byte sequences are accepted, and the decoded values are not
     * inspected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $Str the bytes to examine
     * @return bool true if all sequences are well formed
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);
            $pos++;

            if ($lead < 0x80) {
                continue;                       // 0bbbbbbb, plain ASCII
            }

            // number of continuation bytes announced by the lead byte
            if (($lead & 0xE0) == 0xC0) {
                $pending = 1;                   // 110bbbbb
            } elseif (($lead & 0xF0) == 0xE0) {
                $pending = 2;                   // 1110bbbb
            } elseif (($lead & 0xF8) == 0xF0) {
                $pending = 3;                   // 11110bbb
            } elseif (($lead & 0xFC) == 0xF8) {
                $pending = 4;                   // 111110bb
            } elseif (($lead & 0xFE) == 0xFC) {
                $pending = 5;                   // 1111110b
            } else {
                return false;                   // matches no model
            }

            // each announced byte has to exist and look like 10bbbbbb
            while ($pending > 0) {
                if ($pos == $total) {
                    return false;
                }
                if ((ord($Str[$pos]) & 0xC0) != 0x80) {
                    return false;
                }
                $pos++;
                $pending--;
            }
        }

        return true;
    }
}
108
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Leading and
     * trailing separators are removed before the last component is taken.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim($path,'\\/');
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        // after trim() a separator can not sit at position 0, so a falsy
        // $rpos always means "no separator found"
        if($rpos) $path = substr($path, $rpos+1);

        $suffix = (string) $suffix;
        $suflen = strlen($suffix);
        // strict comparison: with == numeric looking strings such as '010'
        // and '1e1' are considered equal and a non-matching suffix is removed
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
135
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * The character count is determined with the first available mechanism:
     *
     *  1. mb_strlen() when mbstring support is enabled (UTF8_MBSTRING)
     *  2. iconv_strlen()
     *  3. utf8_decode() - it converts characters that are not in ISO-8859-1
     *     to '?', which, for the purpose of counting, is alright. The function
     *     is deprecated as of PHP 8.2, so it is only a legacy fallback.
     *  4. plain strlen(), which counts bytes and is only correct for ASCII
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    mb_strlen()
     *
     * @param string $string
     * @return int
     */
    function utf8_strlen($string) {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        if (function_exists('iconv_strlen')) {
            return iconv_strlen($string, 'UTF-8');
        }

        // deprecated since PHP 8.2, kept for installations lacking both
        // mbstring and iconv
        if (function_exists('utf8_decode')) {
            return strlen(utf8_decode($string));
        }

        return strlen($string);
    }
}
163
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     *
     * With mbstring enabled (UTF8_MBSTRING) the call is handed to mb_substr()
     * unchanged. Otherwise the substring is cut out with a regular expression
     * using the 'u' modifier. In that fallback an empty string is returned
     * whenever the expression does not match.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left); negative values count from the end
     * @param int $length (optional) length in UTF-8 characters from offset; negative values leave out
     *                    that many characters at the end of the string
     * @return string
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            // mb_substr() is called without the length argument when none was given
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        // both negative and the cut-off point lies before the start point
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = utf8_strlen($str);        // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;       // more characters requested than available: start at the beginning
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            // $Ox full groups of 65535 characters plus $Oy remaining characters
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            // $strlen is already known if a negative offset was normalised above
            if (!isset($strlen)) $strlen = utf8_strlen($str);    // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                // NOTE: despite its indentation the second statement below is not part of the if,
                //       the capturing group is always built
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                    $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                // more characters are to be left out than remain after the offset
                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        // 's' lets '.' match newlines as well; the wanted part is the first captured group
        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
268
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is assembled from three parts: the first $start characters
     * of the input (only if $start is positive), the replacement, and
     * everything from character $start+$length onwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     *
     * @param string $string      input string
     * @param string $replacement the replacement
     * @param int    $start       character offset at which the replacement begins
     * @param int    $length      number of characters to replace; with the default of zero the
     *                            replacement is inserted at $start
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $tail = utf8_substr($string, $start+$length);

        return $head . $replacement . $tail;
    }
}
292
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist PHP's own ltrim() is used. Otherwise every character
     * of the charlist is taken literally (ranges such as "a..z" are not
     * supported) and removed from the start of the string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     *
     * @param  string $str
     * @param  string $charlist characters to strip, each one taken literally
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; preg_quote() also
        // covers ^ and : which would otherwise negate or break the class
        $class = preg_quote($charlist, '/');

        return preg_replace('/^['.$class.']+/u','',$str);
    }
}
313
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist PHP's own rtrim() is used. Otherwise every character
     * of the charlist is taken literally (ranges such as "a..z" are not
     * supported) and removed from the end of the string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     *
     * @param  string $str
     * @param  string $charlist characters to strip, each one taken literally
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; preg_quote() also
        // covers ^ and : which would otherwise negate or break the class
        $class = preg_quote($charlist, '/');

        // D: let $ match at the very end only. Without it $ also matches in
        // front of a trailing newline and characters before it get stripped.
        return preg_replace('/['.$class.']+$/uD','',$str);
    }
}
334
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist delegates to PHP's trim(). Otherwise the string is
     * passed through utf8_rtrim() and then utf8_ltrim() with the same charlist.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     *
     * @param  string $str
     * @param  string $charlist characters to strip from both ends
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $rightTrimmed = utf8_rtrim($str,$charlist);
            return utf8_ltrim($rightTrimmed,$charlist);
        }

        return trim($str);
    }
}
352
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * With mbstring enabled the string is lowercased by mb_strtolower(). If
     * the Normalizer class is already loaded, the lowercased string is passed
     * through Normalizer::normalize() and that result is returned as is.
     * Without mbstring the global $UTF8_UPPER_TO_LOWER table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        $lowered = mb_strtolower($string,'utf-8');

        // never trigger autoloading just to find out whether intl is around
        if(class_exists("Normalizer", false)){
            return Normalizer::normalize($lowered);
        }

        return $lowered;
    }
}
377
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * With mbstring enabled mb_strtoupper() does the work, otherwise the
     * global $UTF8_LOWER_TO_UPPER table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
398
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * A string that can not be split into characters (for example because it
     * is not valid UTF-8) is returned unchanged.
     *
     * @author Harry Fuecks
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $str = (string) $str;
        if($str === '') return '';

        // split into first character and remainder; 's' lets '.' match
        // newlines. With 'u' preg_match() fails on invalid UTF-8, in which
        // case $matches must not be touched.
        if(!preg_match('/^(.{1})(.*)$/us', $str, $matches)){
            return $str;
        }

        return utf8_strtoupper($matches[1]).$matches[2];
    }
}
421
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * Words are separated by form feeds, horizontal tabs, vertical tabs,
     * linefeeds, carriage returns and spaces. The separators themselves are
     * kept untouched.
     *
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     * @param string $str
     * @return string with first char of each word uppercase
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The match consists of the leading whitespace (group 2, may be empty),
     * the first character of the word (group 3) and the rest of the word.
     *
     * @author Harry Fuecks
     * @see utf8_ucwords
     * @see utf8_strtoupper
     *
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // Cut the remainder by byte length of what precedes it. ltrim() must
        // not be used to find the word: its default character set leaves form
        // feeds in place and strips NUL bytes, unlike the pattern above.
        $rest = substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
460
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS. By default both are applied, lower case first.
     * Pass $case = -1 to treat lower case letters only or $case = 1 to treat
     * upper case letters only.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case -1: lower case only, 1: upper case only, 0: both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $result = $string;

        if($case <= 0){
            global $UTF8_LOWER_ACCENTS;
            $result = strtr($result,$UTF8_LOWER_ACCENTS);
        }

        if($case >= 0){
            global $UTF8_UPPER_ACCENTS;
            $result = strtr($result,$UTF8_UPPER_ACCENTS);
        }

        return $result;
    }
}
486
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched. Everything else is run through
     * the global $UTF8_ROMANIZATION replacement table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; // nothing to do
    }
}
503
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
     * string. It is quoted once and cached for all later calls. The bytes
     * 0x00 to 0x19 are always added to the stripped characters.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip; inserted verbatim into the regexp
     *                            character class, so it has to be escaped by the caller
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';

        return preg_replace($pattern,$repl,$string);
    }
}
530
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched bytewise with strpos(). The byte position of a
     * hit is converted to a character position by measuring the text in front
     * of it. If that character position lies before the requested offset, the
     * search is repeated further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset at which the search starts
     * @return integer|false character position of the first hit, false if there is none
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // extra bytes to skip: difference between byte and character
        // position of the most recent hit that came too early
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            $length   = utf8_strlen(substr($haystack, 0, $pos));
            $tooEarly = ($length < $offset);

            if ($tooEarly) {
                $comp = $pos - $length;
            }
        } while ($tooEarly);

        return $length;
    }
}
562
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied as they are, code points below 0x100
     * become decimal entities and all others hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $pieces[] = '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $pieces[] = "&#$cp;";
            } else {
                $pieces[] = chr($cp);
            }
        }

        return implode('', $pieces);
    }
}
587
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * By default ($entities is null) only numeric entities are decoded.
     * Passing any non-null value, e.g. HTML_ENTITIES, decodes named entities
     * such as &amp; and &lt; as well. Doing both in a single pass avoids the
     * problem that would occur if you had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
620
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects a match array as produced by the patterns in utf8_unhtml():
     * index 0 holds the complete entity, index 2 an optional 'x' or 'X'
     * marking hexadecimal notation and index 3 the digits.
     *
     * Those patterns accept any alphanumeric characters as digits. An entity
     * whose digits are not valid for its notation is returned unchanged
     * instead of being converted to a bogus character.
     *
     * @param $ent string[] The match array of a numeric entity
     * @return string|false
     */
    function utf8_decode_numeric($ent) {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (strspn($digits, '0123456789abcdefABCDEF') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (strspn($digits, '0123456789') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = intval($digits);
        }

        return unicode_to_utf8(array($cp));
    }
}
641
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     *
     * Holds a lookup table from named entity (e.g. '&eacute;') to the UTF-8
     * encoded character and offers a callback for preg_replace_callback().
     */
    class utf8_entity_decoder {
        /** @var array named entity => UTF-8 character */
        protected $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly, so its
         * characters can be used as they are. Converting them with ord()
         * would only look at the first byte of a multibyte character and
         * decode e.g. &eacute; to the wrong character.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_QUOTES, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * No longer used by this class, kept for backwards compatibility.
         * Only the first byte of $c is looked at, so the result is only
         * meaningful for single byte (ISO-8859-1) input.
         *
         * @param string $c
         * @return string|false
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Expects a match array as produced by the named entity pattern in
         * utf8_unhtml(): index 0 is the whole entity, index 1 is '#' for
         * numeric entities. Unknown named entities are returned unchanged.
         *
         * @param string[] $ent The match array of an entity
         * @return string|false
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }

            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }

            return $ent[0];
        }
    }
}
685
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING
     *
     * Without $strict the function is lenient:
     *  - octets that can not start a sequence are skipped
     *  - an octet that is not a continuation but arrives while one is
     *    expected is dropped, the decoder keeps waiting for continuations
     *  - code points that fail the validity checks (non-shortest form,
     *    more than 4 octets, surrogates, out of range) are still added to
     *    the output
     *
     * In both modes a sequence that is cut off by the end of the string is
     * discarded without an error.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     *
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     * (In lenient mode such an octet is skipped silently.)
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: merge its 6 payload bits into the code point
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            // only strict mode rejects the input, lenient mode
                            // falls through and outputs the code point below
                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     * (In lenient mode the octet is dropped and the state is kept.)
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
860
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing Unicode code points and returns
     * a UTF-8 string. Astral planes are supported, i.e. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM (0xFEFF) are dropped.
     * Surrogates (0xD800-0xDFFF) are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or > 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. If $strict is false such values are silently
     * skipped.
     *
     * The result is assembled by plain string concatenation instead of
     * output buffering, so no output buffer is left open when the function
     * bails out early in strict mode.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points;
     *                      an empty string if $arr is not an array
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # Not a Unicode code point at all (negative or beyond plane 16)

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }
                // non-strict: silently drop the value

            } elseif ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } elseif ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } elseif ($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Surrogates are not valid scalar values

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }
                // non-strict: silently drop the surrogate

            } elseif ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff, upper bound checked above)

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
954
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mb_string is available. Otherwise the
     * string is decoded with utf8_to_unicode() and every code point is
     * packed as a big-endian 16 bit unit; code points above 0xFFFF are
     * written as a surrogate pair so that the fallback produces real UTF-16
     * instead of truncated UCS-2.
     *
     * @param string $str UTF-8 encoded input (passed by reference)
     * @param bool $bom prepend the UTF-16BE byte order mark (0xFE 0xFF)?
     * @return string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // nothing to encode when decoding did not yield a code point list
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // astral plane: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
976
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big-endian 16 bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF)
     * is combined into the code point it encodes; all other units are
     * handed to unicode_to_utf8() unchanged, which is called in non-strict
     * mode.
     *
     * @param string $str UTF-16BE encoded input (passed by reference)
     * @return false|string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        // unicode_to_utf8() returns '' for anything that is not an array
        if(!is_array($units)) return '';

        // unpack() keys start at 1, reindex for simple look-ahead
        $units = array_values($units);
        $count = count($units);

        $uni = array();
        for($i = 0; $i < $count; $i++){
            $unit = $units[$i];
            if($unit >= 0xD800 && $unit <= 0xDBFF && ($i + 1) < $count &&
               $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
                // valid surrogate pair: merge into one astral code point
                $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
                $i++;
            }else{
                $uni[] = $unit;
            }
        }
        return unicode_to_utf8($uni);
    }
}
991
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned left to right with an offset, so the input is
     * never copied while walking over it. Well-formed sequences are kept as
     * they are, every byte that is not part of one is swapped for $replace.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad
     *                        bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        $result = '';
        $offset = 0;
        $len    = strlen($str);

        // \G pins each match to $offset, so the scan consumes the string
        // strictly sequence by sequence, just like repeatedly cutting off
        // the matched prefix would
        while ($offset < $len && preg_match('/\G'.$UTF8_BAD.'/S', $str, $matches, 0, $offset)) {
            // group 2 is only present when the "invalid byte" branch matched
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $offset += strlen($matches[0]);
        }
        return $result;
    }
}
1034
if(!function_exists('utf8_correctIdx')){
    /**
     * Snap a byte offset into a UTF-8 string onto a character boundary
     *
     * Offsets of 0 or less yield 0, offsets at or beyond the end of the
     * string yield the string's byte length. Any other offset is moved off
     * UTF-8 continuation bytes (bit pattern 10xxxxxx) in the requested
     * direction until it rests on a byte that is not a continuation byte
     * or reaches the respective end of the string.
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for boundary,
     *                           false = backwards (start of the current character)
     *                           true = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        // walk forwards or backwards depending on the requested direction
        $step = $next ? 1 : -1;
        while ($i > 0 && $i < $limit && (ord($str[$i]) & 0xC0) == 0x80) {
            $i += $step;
        }

        return $i;
    }
}
1065
1066// only needed if no mb_string available
1067if(!UTF8_MBSTRING){
1068    /**
1069     * UTF-8 Case lookup table
1070     *
     * This lookup table maps lower case letters to their corresponding
     * upper case letters in UTF-8
1073     *
1074     * @author Andreas Gohr <andi@splitbrain.org>
1075     */
1076    global $UTF8_LOWER_TO_UPPER;
1077    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1078            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
1079            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
1080            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1081            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1082            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1083            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1084            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1085            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1086            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1087            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1088            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1089            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1090            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1091            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1092            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1093            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1094            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1095            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1096            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1097            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1098            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1099            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1100            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1101            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1102            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1103            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1104            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1105            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1106            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1107            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1108            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1109            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1110            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1111            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1112            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1113            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1114            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1115            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1116            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1117            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1118            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1119            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1120            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1121            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1122            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1123            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1124            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1125            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1126            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1127            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1128            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1129            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1130            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1131            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1132            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1133            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1134            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1135            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1136            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1137            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1138            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1139            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1140            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1141            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1142            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1143            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1144            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1145                );
1146
1147    /**
1148     * UTF-8 Case lookup table
1149     *
     * This lookup table maps upper case letters to their corresponding
     * lower case letters in UTF-8
1152     *
1153     * @author Andreas Gohr <andi@splitbrain.org>
1154     */
1155    global $UTF8_UPPER_TO_LOWER;
1156    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1157            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1158            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1159            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1160            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1161            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1162            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1163            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1164            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1165            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1166            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1167            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1168            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1169            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1170            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1171            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1172            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1173            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1174            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1175            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1176            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1177            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1178            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1179            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1180            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1181            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1182            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1183            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1184            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1185            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1186            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1187            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1188            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1189            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1190            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1191            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1192            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1193            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1194            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1195            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1196            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1197            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1198            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1199            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1200            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1201            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1202            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1203            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1204            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1205            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1206            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1207            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1208            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1209            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1210            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1211            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1212            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1213            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1214            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1215            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1216            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1217            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1218            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1219            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1220            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1221            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1222            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1223            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1224                );
1225}; // end of case lookup tables
1226
1227/**
1228 * UTF-8 lookup table for lower case accented letters
1229 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are lower case letters only.
1232 *
1233 * @author Andreas Gohr <andi@splitbrain.org>
1234 * @see    utf8_deaccent()
1235 */
1236global $UTF8_LOWER_ACCENTS;
1237if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1238  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1239  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1240  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1241  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1242  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1243  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1244  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1245  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1246  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1247  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1248  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1249  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1250  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1251  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1252  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1253);
1254
1255/**
1256 * UTF-8 lookup table for upper case accented letters
1257 *
 * This lookup table defines replacements from the ASCII-7 range for accented
 * characters. These are upper case letters only.
1260 *
1261 * @author Andreas Gohr <andi@splitbrain.org>
1262 * @see    utf8_deaccent()
1263 */
1264global $UTF8_UPPER_ACCENTS;
1265if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1266  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1267  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1268  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1269  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1270  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1271  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1272  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1273  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1274  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1275  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1276  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1277  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1278  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1279  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1280  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1281);
1282
1283/**
1284 * UTF-8 array of common special characters
1285 *
1286 * This array should contain all special characters (not a letter or digit)
1287 * defined in the various local charsets - it's not a complete list of non-alphanum
1288 * characters in UTF-8. It's not perfect but should match most cases of special
1289 * chars.
1290 *
1291 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1292 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1293 *
1294 * @author Andreas Gohr <andi@splitbrain.org>
1295 * @see    utf8_stripspecials()
1296 */
1297global $UTF8_SPECIAL_CHARS;
1298if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1299  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1300  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1301          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1302  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1303  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1304  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1305  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1306  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1307  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1308  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1309  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1310  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1311  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1312  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1313  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1314  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1315  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1316  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1317  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1318  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1319  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1320  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1321  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1322  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1323  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1324  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1325  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1326  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1327  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1328  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1329  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1330  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1331  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1332  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1333  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1334  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1335  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1336  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1337  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1338  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1339  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1340  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1341  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1342  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1343  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1344  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1345  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1346  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1347  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1348  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1349  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1350  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1351  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1352          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1353  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1354  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1355  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1356  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1357  0xffeb, 0xffec, 0xffed, 0xffee,
1358  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1359  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1360  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
1361  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
1362  0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1363);
1364
1365// utf8 version of above data
1366global $UTF8_SPECIAL_CHARS2;
1367if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1368    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1369    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1370    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1371    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1372    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1373    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1374    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1375    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1376    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1377    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1378    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1379    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1380    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1381    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1382    '➷➸➹➺➻➼➽➾'.
1383    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1384    '�'.
1385    '�ﹼﹽ'.
1386    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1387    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1388    '����������������������������������������������������������������'.
1389    '   ⁠';
1390
1391/**
1392 * Romanization lookup table
1393 *
1394 * This lookup tables provides a way to transform strings written in a language
1395 * different from the ones based upon latin letters into plain ASCII.
1396 *
1397 * Please note: this is not a scientific transliteration table. It only works
1398 * oneway from nonlatin to ASCII and it works by simple character replacement
1399 * only. Specialities of each language are not supported.
1400 *
1401 * @author Andreas Gohr <andi@splitbrain.org>
1402 * @author Vitaly Blokhin <vitinfo@vitn.com>
1403 * @link   http://www.uconv.com/translit.htm
1404 * @author Bisqwit <bisqwit@iki.fi>
1405 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1406 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1407 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1408 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1409 * @author Arthit Suriyawongkul <arthit@gmail.com>
1410 * @author Denis Scheither <amorphis@uni-bremen.de>
1411 * @author Eivind Morland <eivind.morland@gmail.com>
1412 */
1413global $UTF8_ROMANIZATION;
1414if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1415  // scandinavian - differs from what we do in deaccent
1416  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1417
1418  //russian cyrillic
1419  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1420  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1421  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1422  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1423  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1424  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1425  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1426  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1427  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1428  // Ukrainian cyrillic
1429  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1430  // Georgian
1431  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1432  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1433  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1434  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1435  'ჰ'=>'xh',
1436  //Sanskrit
1437  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1438  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1439  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1440  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1441  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1442  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1443  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1444  //Sanskrit diacritics
1445  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1446  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1447  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1448  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1449  //Hebrew
1450  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1451  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1452  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1453  'ש'=>'sh','ת'=>'t',
1454  //Arabic
1455  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1456  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1457  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1458  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1459
1460  // Japanese characters  (last update: 2008-05-09)
1461
1462  // Japanese hiragana
1463
1464  // 3 character syllables, っ doubles the consonant after
1465  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1466  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1467  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1468  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1469  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1470  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1471  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1472  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1473  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1474  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1475  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1476
1477  // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1478  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1479  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1480
1481   // 2 character syllables - normal
1482  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1483  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1484  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1485  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1486  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1487  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1488  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1489  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1490  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1491  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1492  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1493  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1494  'うぇ'=>'we','うぃ'=>'wi',
1495  'いぇ'=>'ye',
1496
1497  // 2 character syllables, っ doubles the consonant after
1498  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1499  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1500  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1501  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1502  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1503  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1504  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1505  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1506  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1507  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1508  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1509
1510  // 1 character syllabels
1511  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1512  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1513  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1514  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1515  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1516  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1517  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1518  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1519  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1520  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1521  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1522  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1523  'わ'=>'wa','を'=>'wo',
1524  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1525  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1526  // old characters
1527  'ゑ'=>'we','ゐ'=>'wi',
1528
1529  //  convert what's left (probably only kicks in when something's missing above)
1530  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1531  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1532
1533  // never seen one of those (disabled for the moment)
1534  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1535  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1536  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1537  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1538  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1539  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1540  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1541  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1542  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1543  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1544  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1545  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1546  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1547  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1548
1549  // 'spare' characters from other romanization systems
1550  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1551  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1552  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1553  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1554  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1555  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1556  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1557  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1558  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1559  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1560
1561
1562  // Japanese katakana
1563
1564  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs)
1565  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1566  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1567  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1568  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1569  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1570  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1571  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1572  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1573  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1574  'ッティー'=>'ttii',
1575  'ッヂィー'=>'ddii',
1576
1577  // 3 character syllables - doubled vowels
1578  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1579  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1580  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1581  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1582  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1583  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1584  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1585  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1586  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1587  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1588  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1589  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1590  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1591  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1592  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1593  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1594  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1595  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1596  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1597  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1598  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1599  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1600  'ウェー'=>'wee','ウィー'=>'wii',
1601  'イェー'=>'yee',
1602  'ティー'=>'tii',
1603  'ヂィー'=>'dii',
1604
1605  // 3 character syllables - doubled consonants
1606  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1607  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1608  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1609  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1610  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1611  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1612  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1613  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1614  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1615  'ッティ'=>'tti',
1616  'ッヂィ'=>'ddi',
1617
1618  // 3 character syllables - doubled vowel and consonants
1619  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1620  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1621  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1622  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1623  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1624  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1625  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1626  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1627  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1628  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1629  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1630
1631  // 2 character syllables - normal
1632  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1633  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1634  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1635  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1636  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1637  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1638  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1639  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1640  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1641  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1642  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1643  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1644  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1645  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1646  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1647  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1648  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1649  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1650  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1651  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1652  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1653  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1654  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1655  'ウェ'=>'we','ウィ'=>'wi',
1656  'イェ'=>'ye',
1657  'ティ'=>'ti',
1658  'ヂィ'=>'di',
1659
1660  // 2 character syllables - doubled vocal
1661  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1662  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1663  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1664  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1665  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1666  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1667  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1668  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1669  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1670  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1671  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1672  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1673  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1674  'ワー'=>'waa','ヲー'=>'woo',
1675  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1676  'ヵー'=>'kaa','ヶー'=>'kee',
1677  // old characters
1678  'ヱー'=>'wee','ヰー'=>'wii',
1679
1680  // seperate katakana 'n'
1681  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1682  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1683
1684  // 2 character syllables - doubled consonants
1685  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1686  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1687  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1688  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1689  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1690  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1691  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1692  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1693  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1694  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1695  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1696
1697  // 1 character syllables
1698  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1699  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1700  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1701  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1702  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1703  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1704  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1705  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1706  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1707  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1708  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1709  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1710  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1711  'ワ'=>'wa','ヲ'=>'wo',
1712  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1713  'ヵ'=>'ka','ヶ'=>'ke',
1714  // old characters
1715  'ヱ'=>'we','ヰ'=>'wi',
1716
1717  //  convert what's left (probably only kicks in when something's missing above)
1718  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1719  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1720
1721  // special characters
1722  '・'=>'_','、'=>'_',
1723  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1724
1725  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1726  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1727  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1728  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1729  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1730  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1731  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1732  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1733  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1734  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1735  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1736  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1737
1738  // "Greeklish"
1739  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1740  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1741
1742  // Thai
1743  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1744  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1745  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1746  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1747  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1748  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1749  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1750  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1751  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1752  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1753  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1754  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1755  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1756  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1757  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1758  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1759  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1760  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1761
1762  // Korean
1763  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1764  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1765  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1766  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1767  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1768  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1769);
1770
1771
1772