xref: /dokuwiki/inc/utf8.php (revision f2cfd2ce9ab3c204e78cd3e6589f8bb8d0079621)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect whether the mbstring extension is to be used
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, 0 otherwise. A value defined before this point is kept.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// when mbstring is in use, make UTF-8 its internal encoding
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string is made of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str the string to inspect
     * @return bool   false if a byte outside 0x00-0x7F was found, true otherwise
     */
    function utf8_isASCII($str){
        // the pattern matches the first byte that is not in the ASCII range
        $highByteFound = preg_match('/(?:[^\x00-\x7F])/', $str);
        return ($highByteFound !== 1);
    }
}
32
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or above is
     * dropped, all other bytes are kept in their original order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to strip
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax no longer parses on PHP 8
            $byte = $str[$i];
            if(ord($byte) < 128){
                $ascii .= $byte;
            }
        }
        return $ascii;
    }
}
52
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is checked: every lead byte has to be followed
     * by the number of 10bbbbbb continuation bytes it announces. Lead bytes
     * announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the string to check
     * @return bool   true if all sequences are structurally complete
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);
            $pos++;

            if ($lead < 0x80) {
                continue;                                   // 0bbbbbbb
            }

            if (($lead & 0xE0) == 0xC0) {
                $trailing = 1;                              // 110bbbbb
            } elseif (($lead & 0xF0) == 0xE0) {
                $trailing = 2;                              // 1110bbbb
            } elseif (($lead & 0xF8) == 0xF0) {
                $trailing = 3;                              // 11110bbb
            } elseif (($lead & 0xFC) == 0xF8) {
                $trailing = 4;                              // 111110bb
            } elseif (($lead & 0xFE) == 0xFC) {
                $trailing = 5;                              // 1111110b
            } else {
                return false;                               // matches no model
            }

            // the announced number of 10bbbbbb bytes has to follow
            while ($trailing > 0) {
                if ($pos >= $total) {
                    return false;
                }
                if ((ord($Str[$pos]) & 0xC0) != 0x80) {
                    return false;
                }
                $pos++;
                $trailing--;
            }
        }

        return true;
    }
}
80
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Uses mb_strlen() when UTF8_MBSTRING is enabled. Otherwise the number of
     * characters is derived from the bytes: every byte that is not a
     * continuation byte (10bbbbbb) starts a character.
     *
     * utf8_decode() is deliberately not used here, it is deprecated as of
     * PHP 8.2.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        $string = (string) $string;

        if(defined('UTF8_MBSTRING') && UTF8_MBSTRING){
            return mb_strlen($string, 'UTF-8');
        }

        // byte mode on purpose (no 'u' flag): count the continuation bytes
        $continuations = preg_match_all('/[\x80-\xBF]/', $string, $unused);
        return strlen($string) - (int) $continuations;
    }
}
97
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * With UTF8_MBSTRING enabled the call is handed to mb_substr(), otherwise
     * the part is cut out with a PCRE pattern in UTF-8 mode.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param  string  $str    the string to cut from
     * @param  integer $offset number of UTF-8 characters offset (from left)
     * @param  integer $length (optional) length in UTF-8 characters from offset
     * @return string  the requested part; the PCRE fallback returns '' when
     *                 the pattern does not match
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = utf8_strlen($str);                // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if (!isset($strlen)) $strlen = utf8_strlen($str);    // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                // reduce any length that would go past the end of the string
                $length = min($strlen-$offset, $length);

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
201
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Unlike substr_replace() the default $length is 0, i.e. the replacement
     * is inserted at $start without removing anything.
     *
     * A negative $start counts from the end of the string and a negative
     * $length stops replacing that many characters before the end, as in
     * substr_replace().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string  $string      the UTF-8 string to work on
     * @param  string  $replacement the string to put in
     * @param  integer $start       character offset where the replacement starts
     * @param  integer $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $start  = (int) $start;
        $length = (int) $length;

        // translate negative arguments into absolute character counts, the
        // string length is only calculated when it is really needed
        if($start < 0 || $length < 0){
            $strlen = utf8_strlen($string);
            if($start < 0)  $start  = max(0, $strlen + $start);
            if($length < 0) $length = max(0, $strlen - $start + $length);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
217
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist this is plain ltrim(). With a charlist, every
     * character of the list is stripped from the start of the string; the
     * list is a simple set of characters, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str      the string to trim
     * @param  string $charlist the characters to strip
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a characterclass, this includes ^ which
        // would negate (or break) the class when it comes first
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
235
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist this is plain rtrim(). With a charlist, every
     * character of the list is stripped from the end of the string; the
     * list is a simple set of characters, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str      the string to trim
     * @param  string $charlist the characters to strip
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a characterclass, this includes ^ which
        // would negate (or break) the class when it comes first
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
253
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist this is plain trim(); otherwise the string is
     * trimmed on the right and then on the left with the given characters.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      the string to trim
     * @param  string $charlist the characters to strip
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist == ''){
            return trim($str);
        }

        $rightTrimmed = utf8_rtrim($str,$charlist);
        return utf8_ltrim($rightTrimmed,$charlist);
    }
}
268
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses the mb_string extension if available, the $UTF8_UPPER_TO_LOWER
     * lookup table otherwise.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string the string to convert
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
286
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses the mb_string extension if available, the $UTF8_LOWER_TO_UPPER
     * lookup table otherwise.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string the string to convert
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
304
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0){
            return '';
        }
        if($chars == 1){
            // a single character is uppercased as a whole
            return utf8_strtoupper($str);
        }

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        return utf8_strtoupper($matches[1]).$matches[2];
    }
}
326
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The match consists of the leading whitespace ($matches[2]), the first
     * character of the word ($matches[3]) and the rest of the word.
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $ucfirst = utf8_strtoupper($matches[3]);

        // Cut the word apart by the byte lengths of the captured groups.
        // ltrim() must not be used to drop the whitespace: its default list
        // does not contain the form feed the pattern treats as whitespace,
        // but does contain the NUL byte the pattern treats as a word char.
        $rest = substr($matches[0], strlen($leadingws) + strlen($matches[3]));

        return $leadingws . $ucfirst . $rest;
    }
}
363
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string the string to deaccent
     * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
385
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII strings are returned untouched, everything else is run
     * through the $UTF8_ROMANIZATION lookup table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string the string to romanize
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; //nothing to do
    }
}
399
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from $UTF8_SPECIAL_CHARS2, the
     * controlchars 0x00 to 0x19 are always added to them. The quoted list
     * is built on the first call and reused afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS;
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class as is, it is not quoted
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}
425
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched bytewise and the byte position of the hit is
     * converted to a character position. A hit in front of the requested
     * character offset makes the search start again further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $byteShift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $byteShift);
            if ($bytePos === false) {
                return false;
            }

            // number of characters in front of the hit
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                // bytes taken by multibyte characters in front of the hit
                $byteShift = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
456
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are kept, code points below 0x100 become decimal
     * entities and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $html = '';
        $codepoints = utf8_to_unicode($str);

        foreach ($codepoints as $cp) {
            if ($cp >= 0x100) {
                $html .= '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $html .= "&#$cp;";
            } else {
                $html .= chr($cp);
            }
        }

        return $html;
    }
}
478
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Numeric and named entities are then decoded in
     * one pass, which avoids the problem that would occur if you had to
     * decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities,
     *                           any value other than null enables it
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // named and numeric entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
510
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback turning a matched numeric entity into its UTF-8 character
     *
     * @param  array $ent match of the entity: [2] holds 'X' or 'x' for a
     *                    hexadecimal entity, [3] the digits
     * @return string the UTF-8 encoded character
     * @see    utf8_unhtml
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');
        $cp = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
525
if(!class_exists('utf8_entity_decoder')){
    /**
     * Decodes named and numeric HTML entities, used as callback by utf8_unhtml()
     */
    class utf8_entity_decoder {
        /** @var array maps a named entity (like '&amp;') to its UTF-8 encoded character */
        var $table;

        /**
         * Builds the entity lookup table
         *
         * The translation table is requested in UTF-8, so flipping it maps
         * entities directly to UTF-8 characters.
         */
        function __construct() {
            $this->table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8')
            );
        }

        /**
         * Old style constructor
         *
         * PHP 8 no longer runs a method named like the class on instantiation,
         * it is only kept for code calling it explicitly.
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Converts a single byte character to UTF-8
         *
         * No longer needed to build the table, kept for backward compatibility.
         *
         * @param  string $c only the first byte is looked at
         * @return string
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback for preg_replace_callback
         *
         * @param  array $ent match of the entity: [0] the whole entity,
         *                    [1] '#' for numeric entities
         * @return string the decoded character, unknown named entities are
         *                returned unchanged
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
548
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING
     *
     * Without $strict nothing is rejected: octets that can neither start nor
     * continue a sequence are skipped, and code points from illegal sequences
     * (overlong forms, surrogates, out of range values) are still put into
     * the output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax no longer parses on PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
722
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are silently left out of the result.
     *
     * The string is built by plain concatenation. No output buffer is opened,
     * so returning early in strict mode cannot leave one behind.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ( ($cp >= 0) && ($cp <= 0x007f) ) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp <= 0x10ffff) {
                # 4 byte sequence

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));

            } elseif($strict) {

                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return false;
            }
        }

        return $utf8;
    }
}
815
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is enabled. The fallback
     * decodes the string with utf8_to_unicode() and writes code points above
     * 0xFFFF as surrogate pairs.
     *
     * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
     * @param  boolean $bom prepend the UTF-16BE byte order mark?
     * @return string  UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-8' === 'UTF-16BE' ? 'UTF-8' : 'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // does not fit into one 16 bit unit: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
833
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is split into 16 bit units, surrogate pairs are joined into
     * one code point and the result is encoded with unicode_to_utf8().
     * Unpaired surrogates are handed to unicode_to_utf8() unchanged.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if (!is_array($units)) return '';

        $uni = array();
        $pending = null;    // high surrogate waiting for its low surrogate

        foreach ($units as $unit) {
            if ($pending !== null) {
                $high = $pending;
                $pending = null;

                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    // complete pair: join both halves into one code point
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    continue;
                }

                // no low surrogate followed, pass the high one on as it is
                $uni[] = $high;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $pending = $unit;
            } else {
                $uni[] = $unit;
            }
        }

        if ($pending !== null) $uni[] = $pending;

        return unicode_to_utf8($uni);
    }
}
845
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is consumed from left to right: a well formed sequence is
     * copied to the result, any other single byte is exchanged for $replace.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param  string $str     string to search
     * @param  string $replace string to replace bad bytes with (defaults to '',
     *                         which removes them) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte
        $pattern = '/'.$UTF8_BAD.'/S';

        $clean = '';
        $pos = 0;

        // every byte matches one of the alternatives, so each hit starts
        // right at $pos and the walk moves on by the length of the hit
        while (preg_match($pattern, $str, $hit, 0, $pos)) {
            // the second group only takes part in the match for an invalid byte
            $clean .= isset($hit[2]) ? $replace : $hit[0];
            $pos += strlen($hit[0]);
        }

        return $clean;
    }
}
887
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside a UTF-8 string onto a character boundary
     *
     * Bytes of the form 10xxxxxx are continuation bytes and never start a
     * character. Starting at $i they are skipped either backwards or
     * forwards until a byte that is not a continuation byte is reached.
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = backwards (start of the current character)
     *                           true  = forwards (start of the next character)
     *
     * @return int            byte index into $str on a character boundary;
     *                        0 for $i <= 0 and strlen($str) for an $i at or
     *                        beyond the end of the string
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        // indexes outside the string are clamped to its two ends
        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i>=$limit) return $limit;

        if (!$next) {
            // step back until index 0 or a non-continuation byte is hit
            while ($i) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
                $i--;
            }
            return $i;
        }

        // step forward until the end of the string or a non-continuation byte
        for (; $i<$limit; $i++) {
            if ((ord($str[$i]) & 0xC0) != 0x80) break;
        }
        return $i;
    }
}
918
919// only needed if no mb_string available
920if(!UTF8_MBSTRING){
921    /**
922     * UTF-8 Case lookup table
923     *
924     * This lookup table maps lower case letters to their corresponding
925     * upper case letters in UTF-8
926     *
927     * @author Andreas Gohr <andi@splitbrain.org>
928     */
929    global $UTF8_LOWER_TO_UPPER;
930    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
931            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
932            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
933            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
934            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
935            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
936            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
937            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
938            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
939            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
940            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
941            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
942            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
943            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
944            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
945            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
946            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
947            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
948            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
949            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
950            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
951            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
952            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
953            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
954            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
955            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
956            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
957            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
958            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
959            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
960            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
961            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
962            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
963            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
964            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
965            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
966            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
967            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
968            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
969            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
970            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
971            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
972            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
973            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
974            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
975            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
976            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
977            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
978            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
979            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
980            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
981            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
982            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
983            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
984            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
985            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
986            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
987            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
988            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
989            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
990            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
991            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
992            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
993            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
994            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
995            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
996            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
997            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
998                );
999
1000    /**
1001     * UTF-8 Case lookup table
1002     *
1003     * This lookup table maps upper case letters to their corresponding
1004     * lower case letters in UTF-8
1005     *
1006     * @author Andreas Gohr <andi@splitbrain.org>
1007     */
1008    global $UTF8_UPPER_TO_LOWER;
1009    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1010            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1011            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1012            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1013            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1014            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1015            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1016            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1017            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1018            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1019            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1020            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1021            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1022            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1023            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1024            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1025            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1026            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1027            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1028            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1029            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1030            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1031            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1032            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1033            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1034            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1035            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1036            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1037            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1038            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1039            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1040            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1041            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1042            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1043            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1044            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1045            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1046            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1047            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1048            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1049            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1050            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1051            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1052            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1053            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1054            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1055            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1056            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1057            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1058            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1059            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1060            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1061            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1062            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1063            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1064            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1065            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1066            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1067            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1068            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1069            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1070            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1071            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1072            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1073            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1074            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1075            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1076            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1077                );
1078}; // end of case lookup tables
1079
1080/**
1081 * UTF-8 lookup table for lower case accented letters
1082 *
1083 * This lookup table maps accented characters to replacements from the ASCII-7
1084 * range. These are lower case letters only.
1085 *
1086 * @author Andreas Gohr <andi@splitbrain.org>
1087 * @see    utf8_deaccent()
1088 */
1089global $UTF8_LOWER_ACCENTS;
1090if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1091  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1092  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1093  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1094  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1095  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1096  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1097  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1098  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1099  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1100  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1101  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1102  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1103  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1104  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1105  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1106);
1107
1108/**
1109 * UTF-8 lookup table for upper case accented letters
1110 *
1111 * This lookup table maps accented characters to replacements from the ASCII-7
1112 * range. These are upper case letters only.
1113 *
1114 * @author Andreas Gohr <andi@splitbrain.org>
1115 * @see    utf8_deaccent()
1116 */
1117global $UTF8_UPPER_ACCENTS;
1118if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1119  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1120  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1121  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1122  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1123  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1124  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1125  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1126  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1127  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1128  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1129  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1130  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1131  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1132  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1133  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1134);
1135
/**
 * Array of common special characters, given as Unicode code points
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * Every entry is an integer code point, never a UTF-8 byte sequence.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // Figure space, narrow no-break space, word joiner and BOM as code points.
  // They used to be listed as the integers 0xc2a0, 0xe28087, 0xe280af,
  // 0xe281a0 and 0xefbbbf, i.e. as their UTF-8 byte sequences, which are
  // either unrelated code points or no code points at all. The no-break
  // space 0x00a0 is already part of the list above.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1217
1218// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS data above
1219global $UTF8_SPECIAL_CHARS2;
1220if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1221    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1222    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1223    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1224    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1225    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1226    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1227    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1228    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1229    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1230    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1231    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1232    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1233    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1234    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1235    '➷➸➹➺➻➼➽➾'.
1236    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1237    '�'.
1238    '�ﹼﹽ'.
1239    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1240    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1241    '����������������������������������������������������������������'.
1242    '   ⁠';
1243
1244/**
1245 * Romanization lookup table
1246 *
1247 * This lookup table provides a way to transform strings written in a language
1248 * that is not based upon Latin letters into plain ASCII.
1249 *
1250 * Please note: this is not a scientific transliteration table. It only works
1251 * one way, from non-Latin to ASCII, and it works by simple character replacement
1252 * only. Specialities of each language are not supported.
1253 *
1254 * @author Andreas Gohr <andi@splitbrain.org>
1255 * @author Vitaly Blokhin <vitinfo@vitn.com>
1256 * @link   http://www.uconv.com/translit.htm
1257 * @author Bisqwit <bisqwit@iki.fi>
1258 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1259 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1260 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1261 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1262 * @author Arthit Suriyawongkul <arthit@gmail.com>
1263 * @author Denis Scheither <amorphis@uni-bremen.de>
1264 */
1265global $UTF8_ROMANIZATION;
1266if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1267  // scandinavian - differs from what we do in deaccent
1268  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1269
1270  //russian cyrillic
1271  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1272  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1273  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1274  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1275  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1276  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1277  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1278  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1279  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1280  // Ukrainian cyrillic
1281  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1282  // Georgian
1283  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1284  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1285  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1286  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1287  'ჰ'=>'xh',
1288  //Sanskrit
1289  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1290  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1291  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1292  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1293  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1294  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1295  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1296  //Hebrew
1297  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1298  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1299  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1300  'ש'=>'sh','ת'=>'t',
1301  //Arabic
1302  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1303  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1304  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1305  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1306
1307  // Japanese characters  (last update: 2008-05-09)
1308
1309  // Japanese hiragana
1310
1311  // 3 character syllables, っ doubles the consonant after
1312  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1313  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1314  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1315  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1316  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1317  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1318  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1319  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1320  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1321  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1322  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1323
1324  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1325  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1326  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1327
1328   // 2 character syllables - normal
1329  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1330  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1331  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1332  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1333  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1334  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1335  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1336  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1337  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1338  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1339  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1340  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1341  'うぇ'=>'we','うぃ'=>'wi',
1342  'いぇ'=>'ye',
1343
1344  // 2 character syllables, っ doubles the consonant after
1345  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1346  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1347  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1348  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1349  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1350  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1351  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1352  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1353  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1354  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1355  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1356
1357  // 1 character syllables
1358  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1359  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1360  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1361  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1362  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1363  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1364  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1365  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1366  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1367  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1368  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1369  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1370  'わ'=>'wa','を'=>'wo',
1371  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1372  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1373  // old characters
1374  'ゑ'=>'we','ゐ'=>'wi',
1375
1376  //  convert what's left (probably only kicks in when something's missing above)
1377  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1378  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1379
1380  // never seen one of those (disabled for the moment)
1381  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1382  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1383  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1384  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1385  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1386  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1387  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1388  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1389  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1390  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1391  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1392  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1393  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1394  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1395
1396  // 'spare' characters from other romanization systems
1397  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1398  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1399  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1400  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1401  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1402  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1403  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1404  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1405  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1406  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1407
1408
1409  // Japanese katakana
1410
1411  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1412  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1413  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1414  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1415  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1416  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1417  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1418  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1419  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1420  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1421  'ッティー'=>'ttii',
1422  'ッヂィー'=>'ddii',
1423
1424  // 3 character syllables - doubled vowels
1425  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1426  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1427  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1428  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1429  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1430  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1431  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1432  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1433  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1434  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1435  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1436  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1437  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1438  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1439  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1440  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1441  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1442  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1443  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1444  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1445  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1446  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1447  'ウェー'=>'wee','ウィー'=>'wii',
1448  'イェー'=>'yee',
1449  'ティー'=>'tii',
1450  'ヂィー'=>'dii',
1451
1452  // 3 character syllables - doubled consonants
1453  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1454  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1455  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1456  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1457  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1458  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1459  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1460  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1461  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1462  'ッティ'=>'tti',
1463  'ッヂィ'=>'ddi',
1464
1465  // 3 character syllables - doubled vowel and consonants
1466  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1467  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1468  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1469  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1470  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1471  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1472  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1473  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1474  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1475  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1476  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1477
1478  // 2 character syllables - normal
1479  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1480  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1481  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1482  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1483  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1484  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1485  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1486  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1487  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1488  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1489  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1490  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1491  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1492  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1493  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1494  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1495  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1496  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1497  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1498  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1499  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1500  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1501  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1502  'ウェ'=>'we','ウィ'=>'wi',
1503  'イェ'=>'ye',
1504  'ティ'=>'ti',
1505  'ヂィ'=>'di',
1506
1507  // 2 character syllables - doubled vowel
1508  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1509  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1510  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1511  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1512  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1513  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1514  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1515  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1516  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1517  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1518  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1519  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1520  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1521  'ワー'=>'waa','ヲー'=>'woo',
1522  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1523  'ヵー'=>'kaa','ヶー'=>'kee',
1524  // old characters
1525  'ヱー'=>'wee','ヰー'=>'wii',
1526
1527  // separate katakana 'n'
1528  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1529  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1530
1531  // 2 character syllables - doubled consonants
1532  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1533  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1534  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1535  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1536  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1537  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1538  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1539  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1540  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1541  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1542  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1543
1544  // 1 character syllables
1545  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1546  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1547  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1548  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1549  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1550  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1551  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1552  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1553  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1554  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1555  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1556  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1557  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1558  'ワ'=>'wa','ヲ'=>'wo',
1559  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1560  'ヵ'=>'ka','ヶ'=>'ke',
1561  // old characters
1562  'ヱ'=>'we','ヰ'=>'wi',
1563
1564  //  convert what's left (probably only kicks in when something's missing above)
1565  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1566  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1567
1568  // special characters
1569  '・'=>'_','、'=>'_',
1570  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1571
1572  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1573  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1574  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1575  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1576  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1577  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1578  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1579  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1580  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1581  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1582  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1583  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1584
1585  // "Greeklish"
1586  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1587  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1588
1589  // Thai
1590  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1591  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1592  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1593  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1594  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1595  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1596  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1597  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1598  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1599  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1600  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1601  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1602  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1603  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1604  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1605  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1606  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1607  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1608
1609  // Korean
1610  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1611  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1612  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1613  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1614  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1615  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1616);
1617
1618
1619