xref: /dokuwiki/inc/utf8.php (revision 1e8d98e3f18a82c252778432b5c3e338b725c121)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9/**
10 * check for mb_string support
11 */
if(!defined('UTF8_MBSTRING')){
    // mbstring is only used when the extension is loaded and the
    // application has not opted out by defining UTF8_NOMBSTRING
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}
19
20/**
21 * Check if PREG was compiled with UTF-8 support
22 *
23 * Without this many of the functions below will not work, so this is a minimal requirement
24 */
// probe PCRE only when the application has not predefined the constant
defined('UTF8_PREGSUPPORT') || define('UTF8_PREGSUPPORT', (bool) @preg_match('/^.$/u', 'ñ'));
28
29/**
30 * Check if PREG was compiled with Unicode Property support
31 *
32 * This is not required for the functions below, but might be needed in a UTF-8 aware application
33 */
// probe PCRE only when the application has not predefined the constant
defined('UTF8_PROPERTYSUPPORT') || define('UTF8_PROPERTYSUPPORT', (bool) @preg_match('/^\pL$/u', 'ñ'));
37
38
// when mbstring is in use, make UTF-8 its default internal encoding
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
40
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str the string to inspect
     * @return bool true when no byte outside 0x00-0x7F was found
     */
    function utf8_isASCII($str){
        // byte-wise search (no /u modifier) for anything outside the ASCII range
        $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str);
        return $foundHighByte !== 1;
    }
}
54
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is removed, so the result
     * is a pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        // Byte-wise match (deliberately no /u modifier). This replaces the
        // former loop which used the $str{$i} offset syntax that is a parse
        // error on PHP 8.
        return (string) preg_replace('/[\x80-\xFF]+/', '', (string) $str);
    }
}
77
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Walks the bytes and verifies that every lead byte is followed by the
     * number of continuation bytes (10bbbbbb) its bit pattern announces.
     * Sequences of up to six bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $Str
     * @return bool true if all byte sequences are structurally well formed
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while($pos < $total){
            $lead = ord($Str[$pos]);
            $pos++;

            // 0bbbbbbb - plain ASCII, nothing follows
            if($lead < 0x80){
                continue;
            }

            // determine how many continuation bytes the lead byte announces
            if(($lead & 0xE0) == 0xC0){        // 110bbbbb
                $trailing = 1;
            }elseif(($lead & 0xF0) == 0xE0){   // 1110bbbb
                $trailing = 2;
            }elseif(($lead & 0xF8) == 0xF0){   // 11110bbb
                $trailing = 3;
            }elseif(($lead & 0xFC) == 0xF8){   // 111110bb
                $trailing = 4;
            }elseif(($lead & 0xFE) == 0xFC){   // 1111110b
                $trailing = 5;
            }else{
                return false;                  // does not match any model
            }

            // each announced byte has to exist and look like 10bbbbbb
            while($trailing > 0){
                if($pos == $total){
                    return false;
                }
                if((ord($Str[$pos]) & 0xC0) != 0x80){
                    return false;
                }
                $pos++;
                $trailing--;
            }
        }

        return true;
    }
}
108
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Leading and
     * trailing separators are removed before the last component is taken.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim((string) $path, '\\/');
        $suffix = (string) $suffix;

        // position of the last separator of either kind, -1 if there is none
        $slash = strrpos($path, '/');
        $backslash = strrpos($path, '\\');
        $rpos = max(
            ($slash === false) ? -1 : $slash,
            ($backslash === false) ? -1 : $backslash
        );
        $path = substr($path, $rpos + 1);

        // Strict comparison: with == two numeric looking strings such as
        // '1e1' and '010' compare equal and a non-matching suffix was removed.
        $suflen = strlen($suffix);
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
135
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Uses mb_strlen() when the mbstring extension is available and has not
     * been disabled via UTF8_NOMBSTRING. Otherwise falls back to
     * utf8_decode(), which converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    mb_strlen()
     * @see    utf8_decode()
     *
     * @param string $string
     * @return int number of characters
     */
    function utf8_strlen($string){
        // utf8_decode() is deprecated as of PHP 8.2, so it is only the last resort
        if(function_exists('mb_strlen') && !defined('UTF8_NOMBSTRING')){
            return mb_strlen((string) $string, 'UTF-8');
        }
        return strlen(utf8_decode((string) $string));
    }
}
155
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * Delegates to mb_substr() when UTF8_MBSTRING is set, otherwise the part
     * is cut out with a PCRE pattern using the 'u' modifier.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string the requested part, an empty string if nothing matches
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }
            return mb_substr($str, $offset, $length);
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length !== null && $length < 0 && $length < $offset) return '';

        $strlen = null;                                 // character count, computed lazily (see notes)
        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            // utf8_strlen() instead of a direct utf8_decode() call, which is deprecated in PHP 8.2
            $strlen = utf8_strlen($str);
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) {
                $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            }
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if ($length === null) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if ($strlen === null) $strlen = utf8_strlen($str);
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                // reduce any length that would go past the end of the string
                $length = min($strlen-$offset, $length);

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) {
                    $length_pattern = '(?:.{65535}){'.$Lx.'}';
                }
                // always executed, wraps the optional repetition group from above
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) {
                    $length_pattern = '(?:.{65535}){'.$Lx.'}';
                }
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
260
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Offsets and lengths are counted in UTF-8 characters. Unlike
     * substr_replace() the default length is 0, i.e. the replacement is
     * inserted at $start without removing anything.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     *
     * @param string $string      input string
     * @param string $replacement the replacement
     * @param int    $start       the replacing will begin at the start'th offset into string.
     *                            A negative value counts from the end of string.
     * @param int    $length      If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If negative, it is the number of characters from the end of string
     *                            at which to stop replacing. If length is zero then this function will have the
     *                            effect of inserting replacement into string at the given start offset.
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $start = (int) $start;
        $length = (int) $length;

        // Negative arguments are relative to the end of the string. They are
        // translated into absolute values first; previously a negative start
        // silently dropped everything in front of the replacement.
        if($start < 0 || $length < 0){
            $total = utf8_strlen($string);
            if($start < 0){
                $start = max(0, $total + $start);
            }
            if($length < 0){
                $length = max(0, $total - $start + $length);
            }
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
284
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. With a charlist every
     * listed character is stripped from the beginning of the string; ranges
     * ("a..z") are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; preg_quote() also
        // covers '^', which the former hand written escaping missed
        $class = preg_quote($charlist, '/');

        $trimmed = preg_replace('/^['.$class.']+/u','',$str);

        // preg_replace() yields null on failure (e.g. invalid UTF-8 input)
        return ($trimmed === null) ? $str : $trimmed;
    }
}
305
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. With a charlist every
     * listed character is stripped from the end of the string; ranges
     * ("a..z") are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; preg_quote() also
        // covers '^', which the former hand written escaping missed
        $class = preg_quote($charlist, '/');

        // D modifier: '$' must only match at the very end, not in front of a
        // trailing newline, otherwise "abcxx\n" would lose its "xx"
        $trimmed = preg_replace('/['.$class.']+$/uD','',$str);

        // preg_replace() yields null on failure (e.g. invalid UTF-8 input)
        return ($trimmed === null) ? $str : $trimmed;
    }
}
326
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Falls back to the native trim() when no charlist is given, otherwise
     * the string is trimmed on the right and then on the left using the
     * UTF-8 aware helpers.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @see    utf8_ltrim()
     * @see    utf8_rtrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $rightTrimmed = utf8_rtrim($str, $charlist);
            return utf8_ltrim($rightTrimmed, $charlist);
        }

        return trim($str);
    }
}
344
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available. If the Normalizer class is
     * already loaded, the lowercased string is additionally passed through
     * Normalizer::normalize(). Without mb_string the global
     * $UTF8_UPPER_TO_LOWER table is applied with strtr().
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(UTF8_MBSTRING) {
            $lower = mb_strtolower($string, 'utf-8');

            // second argument: do not trigger autoloading
            if (class_exists('Normalizer', false)) {
                $normalized = Normalizer::normalize($lower);
                // normalize() reports failure with false; keep the lowercased
                // string in that case instead of handing false to the caller
                if (is_string($normalized)) return $normalized;
            }

            return $lower;
        }

        global $UTF8_UPPER_TO_LOWER;
        return strtr($string,$UTF8_UPPER_TO_LOWER);
    }
}
369
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER table is applied with strtr().
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            // no mbstring: translate via the lookup table
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string, $UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string, 'utf-8');
    }
}
390
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @see    utf8_strtoupper()
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        switch ( utf8_strlen($str) ) {
            case 0:
                return '';
            case 1:
                return utf8_strtoupper($str);
            default:
                // split into first character and remainder
                if (!preg_match('/^(.{1})(.*)$/us', $str, $matches)) {
                    // no match (e.g. input is not valid UTF-8): hand the
                    // string back untouched instead of reading undefined matches
                    return $str;
                }
                return utf8_strtoupper($matches[1]).$matches[2];
        }
    }
}
413
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     * @param string $str
     * @return string with first char of each word uppercase
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        //
        // group 2: whitespace in front of the word (empty at the start of the string)
        // group 3: first character of the word
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @see utf8_ucwords
     * @see utf8_strtoupper
     *
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first = $matches[3];

        // The whole match is leading whitespace + first char + rest of word,
        // so the rest is cut off by byte length. The former ltrim() call did
        // not strip form feeds (\x0c), which the pattern treats as
        // whitespace, and so the wrong character got replaced.
        $rest = (string) substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
452
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS. $case selects which of them are applied:
     * -1 lower case letters only, 1 upper case letters only, 0 (default) both.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string, $UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string, $UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
478
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned as is, everything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            $string = strtr($string, $UTF8_ROMANIZATION);
        }

        return $string;
    }
}
495
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters from the global $UTF8_SPECIAL_CHARS2 string the
     * control characters 0x00 to 0x19 are stripped as well. The quoted list
     * of special chars is built once and then kept in a static variable.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class unescaped
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';

        return preg_replace($pattern, $repl, $string);
    }
}
522
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done byte-wise with strpos(); the byte position
     * of a hit is converted into a character position with utf8_strlen().
     * Hits in front of the requested character offset are skipped.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset at which to start searching
     * @return integer|false     character position of the first hit, false if there is none
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $comp = 0;          // byte/character difference found so far
        $length = null;     // character position of the current hit
        $bytes = strlen($haystack);

        while (is_null($length) || $length < $offset) {
            $from = $offset + $comp;

            // strpos() throws a ValueError on PHP 8 for offsets outside of
            // the string, report "not found" instead
            if ($from > $bytes || $from < -$bytes)
                return false;

            $pos = strpos($haystack, $needle, $from);

            if ($pos === false)
                return false;

            $length = utf8_strlen(substr($haystack, 0, $pos));

            // hit lies in front of the wanted character offset: move the
            // byte offset forward by the multibyte surplus and search again
            if ($length < $offset)
                $comp = $pos - $length;
        }

        return $length;
    }
}
554
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as plain characters, code points below
     * 0x100 become decimal entities and everything else hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     * @see    utf8_to_unicode()
     *
     * @param string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $pieces[] = '&#x'.dechex($codepoint).';';
            } elseif ($codepoint >= 0x80) {
                $pieces[] = '&#'.$codepoint.';';
            } else {
                $pieces[] = chr($codepoint);
            }
        }

        return implode('', $pieces);
    }
}
579
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * With $entities left at null only numeric entities (&#..; and &#x..;)
     * are decoded via utf8_decode_numeric(). With any other value numeric
     * and named entities, including &amp; &lt; etc., are handled in a single
     * pass by utf8_entity_decoder::decode(). Doing both in one pass avoids
     * the problem that would occur if you had to decode
     * "&amp;#38;&#38;amp;#38;" in two steps.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder is created on the first call and then reused
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            return preg_replace_callback(
                '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                array(&$decoder, 'decode'),
                $str
            );
        }

        return preg_replace_callback(
            '/(&#([Xx])?([0-9A-Za-z]+);)/m',
            'utf8_decode_numeric',
            $str
        );
    }
}
612
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects a match array as produced by the patterns in utf8_unhtml():
     * index 0 holds the complete entity, index 2 an optional 'x'/'X' marking
     * a hexadecimal entity and index 3 the digits.
     *
     * @param $ent string[] A numeric entity (regexp match array)
     * @return string|false the decoded character; the unchanged entity if its digits are malformed
     */
    function utf8_decode_numeric($ent) {
        $digits = $ent[3];

        if ($ent[2] === 'X' || $ent[2] === 'x') {
            // The calling patterns allow any alphanumerics, so "&#xZZ;" gets
            // here as well. It is no entity, leave it alone.
            if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) return $ent[0];
            $cp = hexdec($digits);
        } else {
            // same for decimal entities: "&#nbsp;" used to decode to a NUL byte
            if (!preg_match('/^[0-9]+$/D', $digits)) return $ent[0];
            $cp = intval($digits);
        }

        return unicode_to_utf8(array($cp));
    }
}
633
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps named entities ('&eacute;') to their UTF-8 character */
        var $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly and used as
         * is. Mapping its values through ord() would only look at the first
         * byte of each multibyte character and produce wrong characters.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Uses ord(), i.e. only the first byte of $c is taken into account.
         * No longer used by this class itself, kept for existing callers.
         *
         * @param string $c
         * @return string|false
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to it's correct UTF-8 char equivalent
         *
         * Numeric entities are passed on to utf8_decode_numeric(), named
         * ones are looked up in the table. Unknown entities are returned
         * unchanged.
         *
         * @param string[] $ent An entity (regexp match array, see utf8_unhtml())
         * @return string|false
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }

            if (array_key_exists($ent[0], $this->table)) {
                return $this->table[$ent[0]];
            }

            return $ent[0];
        }
    }
}
677
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. Without $strict, illegal lead or continuation
     * octets are skipped silently and illegal code points are still added
     * to the output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     *
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the {} string offset syntax is gone in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its six payload bits at the right position
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
852
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING. Without
     * $strict such values are skipped.
     *
     * The string is built by plain concatenation. The former output
     * buffering approach left a buffer open whenever strict mode returned
     * early.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                // Out of the Unicode range. Negative values are handled here
                // too, they used to fall into the 2 byte branch and produced
                // garbage bytes.
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (everything up to 0x10ffff)

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
946
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is available. Otherwise the
     * string is decoded with utf8_to_unicode() and encoded by hand; code
     * points above 0xFFFF are written as a surrogate pair so that both code
     * paths produce UTF-16 rather than truncated 16 bit values.
     *
     * @param string $str UTF-8 string to convert
     * @param bool $bom prepend the big endian byte order mark?
     * @return string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // utf8_to_unicode() can return false; there is nothing to encode then
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // split into high (0xD800..) and low (0xDC00..) surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
968
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into 16 bit big endian units. A high surrogate that
     * is directly followed by a low surrogate is combined into one code point
     * before the list is handed to unicode_to_utf8(); all other units,
     * including unpaired surrogates, are passed on unchanged.
     *
     * @param string $str UTF-16BE string to convert
     * @return false|string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        // let unicode_to_utf8() deal with a failed unpack() as before
        if(!is_array($units)) return unicode_to_utf8($units);

        // unpack() numbers its result from 1, reindex for look-ahead
        $units = array_values($units);
        $count = count($units);

        $uni = array();
        for($i = 0; $i < $count; $i++){
            $cp = $units[$i];
            if($cp >= 0xD800 && $cp <= 0xDBFF && $i + 1 < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF){
                // surrogate pair -> single code point above 0xFFFF
                $cp = 0x10000 + (($cp - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++;
            }
            $uni[] = $cp;
        }
        return unicode_to_utf8($uni);
    }
}
983
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * The PCRE pattern matches exactly one well-formed UTF-8 sequence or, as
     * a last resort, one single byte that is not part of such a sequence.
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is walked from left to right using the offset parameter of
     * preg_match(), so the remaining input is not copied for every character.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        removes the bad bytes) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $pattern =
         '/([\x00-\x7F]'.                         # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))/AS';                           # invalid byte; A anchors the match at the offset

        $result = '';
        $offset = 0;
        $length = strlen($str);
        while ($offset < $length && preg_match($pattern, $str, $matches, 0, $offset)) {
            // group 2 is only set when the "invalid byte" alternative matched
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $offset += strlen($matches[0]);
        }
        return $result;
    }
}
1026
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index onto a UTF-8 character boundary
     *
     * An index at or below zero yields 0, an index at or beyond the length of
     * the string yields that length. Any other index is shifted for as long as
     * it points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for boundary,
     *                           false = up (current character)
     *                           true = down (next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {
        if ($i <= 0) {
            return 0;
        }

        $end = strlen($str);
        if ($i >= $end) {
            return $end;
        }

        // walk towards the end of the string or towards its start
        $step = $next ? 1 : -1;
        for ($pos = $i; $pos > 0 && $pos < $end; $pos += $step) {
            if ((ord($str[$pos]) & 0xC0) != 0x80) {
                // not a continuation byte: this is a boundary
                break;
            }
        }

        return $pos;
    }
}
1057
1058// only needed if no mb_string available
1059if(!UTF8_MBSTRING){
1060    /**
1061     * UTF-8 Case lookup table
1062     *
1063     * This lookup table maps lower case letters to their corresponding
1064     * upper case letters in UTF-8
1065     *
1066     * @author Andreas Gohr <andi@splitbrain.org>
1067     */
1068    global $UTF8_LOWER_TO_UPPER;
1069    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1070            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
1071            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
1072            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1073            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1074            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1075            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1076            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1077            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1078            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1079            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1080            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1081            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1082            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1083            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1084            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1085            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1086            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1087            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1088            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1089            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1090            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1091            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1092            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1093            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1094            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1095            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1096            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1097            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1098            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1099            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1100            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1101            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1102            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1103            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1104            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1105            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1106            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1107            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1108            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1109            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1110            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1111            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1112            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1113            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1114            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1115            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1116            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1117            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1118            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1119            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1120            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1121            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1122            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1123            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1124            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1125            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1126            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1127            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1128            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1129            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1130            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1131            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1132            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1133            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1134            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1135            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1136            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1137                );
1138
1139    /**
1140     * UTF-8 Case lookup table
1141     *
1142     * This lookup table maps upper case letters to their corresponding
1143     * lower case letters in UTF-8
1144     *
1145     * @author Andreas Gohr <andi@splitbrain.org>
1146     */
1147    global $UTF8_UPPER_TO_LOWER;
1148    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1149            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1150            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1151            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1152            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1153            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1154            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1155            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1156            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1157            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1158            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1159            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1160            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1161            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1162            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1163            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1164            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1165            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1166            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1167            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1168            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1169            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1170            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1171            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1172            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1173            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1174            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1175            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1176            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1177            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1178            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1179            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1180            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1181            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1182            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1183            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1184            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1185            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1186            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1187            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1188            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1189            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1190            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1191            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1192            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1193            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1194            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1195            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1196            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1197            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1198            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1199            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1200            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1201            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1202            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1203            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1204            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1205            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1206            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1207            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1208            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1209            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1210            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1211            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1212            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1213            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1214            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1215            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1216                );
1217}; // end of case lookup tables
1218
1219/**
1220 * UTF-8 lookup table for lower case accented letters
1221 *
1222 * This lookup table maps accented characters to replacements from the ASCII-7
1223 * range. It covers lower case letters only.
1224 *
1225 * @author Andreas Gohr <andi@splitbrain.org>
1226 * @see    utf8_deaccent()
1227 */
1228global $UTF8_LOWER_ACCENTS;
1229if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1230  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1231  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1232  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1233  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1234  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1235  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1236  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1237  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1238  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1239  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1240  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1241  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1242  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1243  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1244  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1245);
1246
1247/**
1248 * UTF-8 lookup table for upper case accented letters
1249 *
1250 * This lookup table maps accented characters to replacements from the ASCII-7
1251 * range. It covers upper case letters only.
1252 *
1253 * @author Andreas Gohr <andi@splitbrain.org>
1254 * @see    utf8_deaccent()
1255 */
1256global $UTF8_UPPER_ACCENTS;
1257if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1258  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1259  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1260  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1261  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1262  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1263  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1264  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1265  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1266  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1267  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1268  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1269  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1270  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1271  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1272  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1273);
1274
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point, not a UTF-8 byte sequence.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // figure space, narrow no-break space, word joiner and byte order mark.
  // These used to be listed as their UTF-8 byte sequences (0xe28087, ...),
  // which are not code points; the no-break space 0x00a0 is already above.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1356
1357// utf8 version of above data
1358global $UTF8_SPECIAL_CHARS2;
1359if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1360    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1361    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1362    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1363    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1364    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1365    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1366    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1367    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1368    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1369    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1370    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1371    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1372    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1373    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1374    '➷➸➹➺➻➼➽➾'.
1375    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1376    '�'.
1377    '�ﹼﹽ'.
1378    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1379    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1380    '����������������������������������������������������������������'.
1381    '   ⁠';
1382
1383/**
1384 * Romanization lookup table
1385 *
1386 * This lookup table provides a way to transform strings written in a language
1387 * that is not based upon latin letters into plain ASCII.
1388 *
1389 * Please note: this is not a scientific transliteration table. It only works
1390 * one way, from non-latin to ASCII, and it works by simple character replacement
1391 * only. Language-specific peculiarities are not supported.
1392 *
1393 * @author Andreas Gohr <andi@splitbrain.org>
1394 * @author Vitaly Blokhin <vitinfo@vitn.com>
1395 * @link   http://www.uconv.com/translit.htm
1396 * @author Bisqwit <bisqwit@iki.fi>
1397 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1398 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1399 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1400 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1401 * @author Arthit Suriyawongkul <arthit@gmail.com>
1402 * @author Denis Scheither <amorphis@uni-bremen.de>
1403 * @author Eivind Morland <eivind.morland@gmail.com>
1404 */
1405global $UTF8_ROMANIZATION;
1406if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1407  // scandinavian - differs from what we do in deaccent
1408  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1409
1410  //russian cyrillic
1411  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1412  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1413  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1414  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1415  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1416  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1417  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1418  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1419  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1420  // Ukrainian cyrillic
1421  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1422  // Georgian
1423  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1424  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1425  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1426  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1427  'ჰ'=>'xh',
1428  //Sanskrit
1429  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1430  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1431  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1432  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1433  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1434  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1435  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1436  //Sanskrit diacritics
1437  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1438  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1439  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1440  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1441  //Hebrew
1442  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1443  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1444  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1445  'ש'=>'sh','ת'=>'t',
1446  //Arabic
1447  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1448  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1449  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1450  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1451
1452  // Japanese characters  (last update: 2008-05-09)
1453
1454  // Japanese hiragana
1455
1456  // 3 character syllables, っ doubles the consonant after
1457  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1458  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1459  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1460  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1461  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1462  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1463  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1464  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1465  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1466  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1467  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1468
1469  // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1470  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1471  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1472
1473   // 2 character syllables - normal
1474  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1475  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1476  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1477  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1478  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1479  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1480  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1481  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1482  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1483  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1484  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1485  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1486  'うぇ'=>'we','うぃ'=>'wi',
1487  'いぇ'=>'ye',
1488
1489  // 2 character syllables, っ doubles the consonant after
1490  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1491  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1492  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1493  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1494  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1495  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1496  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1497  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1498  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1499  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1500  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1501
1502  // 1 character syllabels
1503  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1504  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1505  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1506  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1507  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1508  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1509  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1510  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1511  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1512  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1513  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1514  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1515  'わ'=>'wa','を'=>'wo',
1516  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1517  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1518  // old characters
1519  'ゑ'=>'we','ゐ'=>'wi',
1520
1521  //  convert what's left (probably only kicks in when something's missing above)
1522  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1523  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1524
1525  // never seen one of those (disabled for the moment)
1526  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1527  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1528  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1529  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1530  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1531  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1532  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1533  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1534  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1535  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1536  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1537  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1538  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1539  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1540
1541  // 'spare' characters from other romanization systems
1542  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1543  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1544  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1545  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1546  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1547  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1548  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1549  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1550  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1551  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1552
1553
1554  // Japanese katakana
1555
1556  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs)
1557  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1558  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1559  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1560  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1561  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1562  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1563  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1564  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1565  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1566  'ッティー'=>'ttii',
1567  'ッヂィー'=>'ddii',
1568
1569  // 3 character syllables - doubled vowels
1570  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1571  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1572  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1573  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1574  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1575  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1576  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1577  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1578  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1579  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1580  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1581  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1582  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1583  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1584  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1585  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1586  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1587  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1588  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1589  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1590  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1591  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1592  'ウェー'=>'wee','ウィー'=>'wii',
1593  'イェー'=>'yee',
1594  'ティー'=>'tii',
1595  'ヂィー'=>'dii',
1596
1597  // 3 character syllables - doubled consonants
1598  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1599  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1600  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1601  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1602  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1603  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1604  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1605  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1606  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1607  'ッティ'=>'tti',
1608  'ッヂィ'=>'ddi',
1609
1610  // 3 character syllables - doubled vowel and consonants
1611  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1612  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1613  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1614  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1615  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1616  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1617  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1618  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1619  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1620  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1621  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1622
1623  // 2 character syllables - normal
1624  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1625  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1626  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1627  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1628  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1629  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1630  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1631  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1632  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1633  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1634  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1635  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1636  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1637  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1638  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1639  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1640  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1641  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1642  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1643  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1644  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1645  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1646  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1647  'ウェ'=>'we','ウィ'=>'wi',
1648  'イェ'=>'ye',
1649  'ティ'=>'ti',
1650  'ヂィ'=>'di',
1651
1652  // 2 character syllables - doubled vocal
1653  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1654  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1655  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1656  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1657  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1658  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1659  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1660  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1661  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1662  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1663  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1664  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1665  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1666  'ワー'=>'waa','ヲー'=>'woo',
1667  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1668  'ヵー'=>'kaa','ヶー'=>'kee',
1669  // old characters
1670  'ヱー'=>'wee','ヰー'=>'wii',
1671
1672  // seperate katakana 'n'
1673  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1674  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1675
1676  // 2 character syllables - doubled consonants
1677  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1678  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1679  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1680  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1681  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1682  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1683  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1684  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1685  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1686  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1687  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1688
1689  // 1 character syllables
1690  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1691  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1692  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1693  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1694  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1695  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1696  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1697  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1698  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1699  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1700  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1701  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1702  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1703  'ワ'=>'wa','ヲ'=>'wo',
1704  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1705  'ヵ'=>'ka','ヶ'=>'ke',
1706  // old characters
1707  'ヱ'=>'we','ヰ'=>'wi',
1708
1709  //  convert what's left (probably only kicks in when something's missing above)
1710  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1711  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1712
1713  // special characters
1714  '・'=>'_','、'=>'_',
1715  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1716
1717  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1718  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1719  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1720  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1721  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1722  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1723  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1724  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1725  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1726  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1727  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1728  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1729
1730  // "Greeklish"
1731  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1732  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1733
1734  // Thai
1735  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1736  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1737  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1738  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1739  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1740  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1741  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1742  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1743  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1744  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1745  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1746  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1747  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1748  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1749  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1750  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1751  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1752  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1753
1754  // Korean
1755  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1756  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1757  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1758  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1759  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1760  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1761);
1762
1763
1764