xref: /dokuwiki/inc/utf8.php (revision 3161005d07beb46bb8a866ec56a768938571ec9d)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
51f2058faSAndreas Gohr * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide whether the mbstring based code paths may be used
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() is available and UTF8_NOMBSTRING
 * has not been defined, 0 otherwise. An already defined value is kept.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

/**
 * Probe whether PCRE accepts the /u modifier (UTF-8 mode)
 *
 * Most functions in this file rely on /u patterns, so this is a minimal requirement
 */
defined('UTF8_PREGSUPPORT') || define('UTF8_PREGSUPPORT', (bool) @preg_match('/^.$/u', 'ñ'));

/**
 * Probe whether PCRE understands Unicode properties such as \pL
 *
 * Not needed by the functions in this file, but useful to UTF-8 aware applications
 */
defined('UTF8_PROPERTYSUPPORT') || define('UTF8_PROPERTYSUPPORT', (bool) @preg_match('/^\pL$/u', 'ñ'));

// make UTF-8 the default encoding for all mb_* functions
if(UTF8_MBSTRING) mb_internal_encoding('UTF-8');
405e613a5cSchris
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string is made of 7bit ASCII bytes only
     *
     * The pattern is applied byte-wise (no /u modifier). The string counts as
     * ASCII unless a byte outside 0x00-0x7F was actually found.
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str
     * @return bool
     */
    function utf8_isASCII($str){
        $foundHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);
        return !$foundHighByte;
    }
}
5144f669e9Sandi
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or above is
     * dropped, all other bytes are kept in their original order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return string
     */
    function utf8_strip($str){
        // The former byte loop used the $str{$i} offset syntax, which is
        // deprecated in PHP 7.4 and a parse error in PHP 8. The pattern has no
        // /u modifier and therefore works on single bytes, just like the loop.
        return preg_replace('/[\x80-\xFF]/', '', (string) $str);
    }
}
71e1906e6eSandi
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Walks over the bytes and checks that every lead byte is followed by the
     * number of 10bbbbbb continuation bytes it announces. Lead bytes announcing
     * up to five continuation bytes are accepted; the decoded values themselves
     * are not inspected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str
     * @return bool true if the byte structure matches, false otherwise
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            // how many continuation bytes does this lead byte announce?
            if ($lead < 0x80)               $follow = 0; // 0bbbbbbb
            elseif (($lead & 0xE0) == 0xC0) $follow = 1; // 110bbbbb
            elseif (($lead & 0xF0) == 0xE0) $follow = 2; // 1110bbbb
            elseif (($lead & 0xF8) == 0xF0) $follow = 3; // 11110bbb
            elseif (($lead & 0xFC) == 0xF8) $follow = 4; // 111110bb
            elseif (($lead & 0xFE) == 0xFC) $follow = 5; // 1111110b
            else return false;                           // does not match any model

            // every announced byte has to exist and has to look like 10bbbbbb
            while ($follow-- > 0) {
                $pos++;
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
            }

            $pos++;
        }

        return true;
    }
}
9949c713a3Sandi
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Separators at the
     * start and end of the path are ignored.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim($path,'\\/');

        // position of the last separator of either kind; after the trim above
        // a separator can never sit at position 0, so a falsy value means "none"
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if($rpos) $path = substr($path, $rpos+1);

        $suflen = strlen($suffix);
        // The comparison has to be strict: with == two numeric looking strings
        // like '100' and '1e2' are considered equal and a suffix that is not
        // actually there would be cut off.
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}
125f393a4ebSAndreas Gohr
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Counts characters by removing all UTF-8 continuation bytes (0x80-0xBF)
     * and measuring what is left: in well-formed UTF-8 every character
     * contributes exactly one non-continuation byte.
     *
     * This replaces the former strlen(utf8_decode()) trick because
     * utf8_decode() is deprecated as of PHP 8.2. Results are the same for
     * well-formed UTF-8. For malformed input the count may differ from the old
     * implementation (a stray continuation byte is not counted).
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string
     * @return int number of characters
     */
    function utf8_strlen($string){
        // no /u modifier: the pattern is applied to single bytes
        return strlen(preg_replace('/[\x80-\xBF]/', '', (string) $string));
    }
}
1422f954959Sandi
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * When UTF8_MBSTRING is set the call is handed to mb_substr(). Otherwise
     * the part is cut out with an anchored PCRE pattern using the 'u' modifier;
     * that fallback returns an empty string whenever there is nothing to
     * return or the pattern does not match.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed the result of mb_substr(), or the extracted string ('' if nothing matched)
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }
            return mb_substr($str, $offset, $length);
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         *
         * characters are counted by stripping the continuation bytes (0x80-0xBF) with a byte-wise
         * pattern; for well-formed UTF-8 this gives the same number as the formerly used
         * strlen(utf8_decode()), without calling a function that is deprecated since PHP 8.2
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';
        $strlen = null;                                 // character count, determined lazily

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = strlen(preg_replace('/[\x80-\xBF]/', '', $str));   // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if (is_null($strlen)) $strlen = strlen(preg_replace('/[\x80-\xBF]/', '', $str));   // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                // reduce any length that would go past the end of the string
                $length = min($strlen-$offset, $length);

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                // more characters to cut from the tail than are left after the offset
                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}
24610f09f2aSAndreas Gohr
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Replaces $length characters of $string, beginning at character $start,
     * with $replacement. Note that, other than with substr_replace(), $length
     * defaults to 0, i.e. the replacement is inserted at $start.
     *
     * Negative values are handled like substr_replace() does: a negative $start
     * counts from the end of the string, a negative $length is the number of
     * characters from the end of the string at which replacing stops.
     * (Formerly a negative $start dropped the beginning of the string and a
     * negative $length duplicated parts of it.)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      the string to work on
     * @param  string $replacement the text to insert
     * @param  int    $start       character offset at which replacing starts
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        // the character count is only needed to resolve negative arguments
        if($start < 0 || $length < 0){
            $strlen = utf8_strlen($string);
            if($start < 0)  $start  = max(0, $strlen + $start);
            if($length < 0) $length = max(0, $strlen - $start + $length);
        }

        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}
262dc57ef04Sandi
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist PHP's ltrim() is used. With a charlist every
     * character in it is removed from the start of the string; the characters
     * are taken literally (no ranges, no escape sequences).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a characterclass. preg_quote() takes care
        // of backslash, hyphen, brackets and the delimiter, and also of '^',
        // which used to negate or break the class when it was the first char
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
282f29317c1Sandi
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist PHP's rtrim() is used. With a charlist every
     * character in it is removed from the end of the string; the characters
     * are taken literally (no ranges, no escape sequences).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a characterclass. preg_quote() takes care
        // of backslash, hyphen, brackets and the delimiter, and also of '^',
        // which used to negate or break the class when it was the first char
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
302f29317c1Sandi
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist means PHP's trim() with its default characters.
     * Otherwise the listed characters are removed from the end and then from
     * the start of the string via utf8_rtrim() and utf8_ltrim().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $rightTrimmed = utf8_rtrim($str, $charlist);
            return utf8_ltrim($rightTrimmed, $charlist);
        }

        return trim($str);
    }
}
319f29317c1Sandi
if(!function_exists('utf8_strtolower')){
    /**
     * Unicode aware replacement for strtolower()
     *
     * Hands the string to mb_strtolower() when UTF8_MBSTRING is set, otherwise
     * translates it with the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
33782257610Sandi
if(!function_exists('utf8_strtoupper')){
    /**
     * Unicode aware replacement for strtoupper()
     *
     * Hands the string to mb_strtoupper() when UTF8_MBSTRING is set, otherwise
     * translates it with the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
35582257610Sandi
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * The first character is split off with a /u pattern and passed through
     * utf8_strtoupper(). When the pattern does not match (empty string, or
     * input the pattern cannot be applied to) the input is returned unchanged
     * as a string. Formerly the match result was used unchecked, which raised
     * undefined offset notices and returned an empty string in that case.
     *
     * @author Harry Fuecks
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        // one pass is enough: group 1 is the first character, group 2 the
        // remainder (possibly empty), so no separate length check is needed
        if(!preg_match('/^(.{1})(.*)$/us', $str, $matches)){
            return (string) $str;
        }

        return utf8_strtoupper($matches[1]).$matches[2];
    }
}
37726ece5a7SAndreas Gohr
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        // Groups: 2 = whitespace in front of the word (if any), 3 = first character of the word
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The match is taken apart by the byte lengths of the captured groups.
     * Formerly the leading whitespace was removed with ltrim(), whose default
     * character set differs from the one used in the pattern (it leaves form
     * feeds in place and strips NUL bytes), so the wrong character could be
     * replaced.
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // everything after whitespace and first character, byte exact
        $rest = (string) substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
41426ece5a7SAndreas Gohr
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * $case selects which translation tables are applied: values <= 0 apply
     * the global $UTF8_LOWER_ACCENTS table, values >= 0 the global
     * $UTF8_UPPER_ACCENTS table. The default of 0 therefore applies both,
     * lower case first.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
43682257610Sandi
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned as is, everything else is translated with
     * the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; //nothing to do
    }
}
4508a831f2bSAndreas Gohr
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
     * string, which is quoted once and then cached for all later calls. The
     * control characters 0x00 to 0x19 are always stripped as well.
     * NOTE(review): 0x1A-0x1F are not covered by that range - confirm this is intended.
     *
     * $additional is put into the character class as is, so it has to be
     * valid (already escaped) character class content.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;
        static $specials = null;

        // quote the special chars on first use only
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}
476099ada41Sandi
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched byte-wise with strpos(), then the byte position
     * is converted into a character position with utf8_strlen(). A hit in
     * front of the requested character offset is skipped by repeating the
     * search further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // difference between byte and character position of the last hit
        $shift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) {
                return false;
            }

            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            // only relevant when another round is needed: start the next
            // search ($offset - $charPos) bytes behind the current hit
            $shift = $bytePos - $charPos;
        } while ($charPos < $offset);

        return $charPos;
    }
}
507f29317c1Sandi
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * The string is split into code points with utf8_to_unicode(). Code points
     * below 0x80 are kept as plain characters, code points below 0x100 become
     * decimal entities and everything else becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $html = '';

        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $html .= '&#x'.dechex($codepoint).';';
            } elseif ($codepoint >= 0x80) {
                $html .= "&#$codepoint;";
            } else {
                $html .= chr($codepoint);
            }
        }

        return $html;
    }
}
5299f9fb0e5STom N Harris
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES (in fact any non-null value) and named entities,
     * including &amp; &lt; etc. are handled as well. Avoids the problem that
     * would occur if you had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * The utf8_entity_decoder with its translation table is only created once
     * named entities are actually requested; it is then reused by later calls.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        static $decoder = null;

        // numeric entities only: no decoding table needed
        // callback groups: 2 = hex marker (x or X), 3 = the digits
        if (is_null($entities)) {
            return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                         'utf8_decode_numeric', $str);
        }

        // named and numeric entities
        // callback groups: 1 = '#' for numeric entities, 2 = hex marker, 3 = digits or name
        if (is_null($decoder)) {
            $decoder = new utf8_entity_decoder();
        }
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array($decoder, 'decode'), $str);
    }
}
561df957b36SAndreas Gohr
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array of the patterns used in utf8_unhtml():
     * index 0 is the complete entity, index 2 the optional hex marker
     * (x or X) and index 3 the digits.
     *
     * Those patterns accept any alphanumeric characters as "digits". An entity
     * whose digits are not valid for its notation (e.g. &#xzz;, &#x; or
     * &#12ab;) is returned unchanged. Formerly such input was silently
     * decoded to a NUL byte or to the character of the leading digits.
     *
     * @param array $ent match array describing a numeric entity
     * @return string the UTF-8 character, or the unchanged entity if it is malformed
     */
    function utf8_decode_numeric($ent) {
        $digits = $ent[3];

        switch ($ent[2]) {
            case 'X':
            case 'x':
                if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) return $ent[0];
                $cp = hexdec($digits);
                break;
            default:
                if (!preg_match('/^[0-9]+$/D', $digits)) return $ent[0];
                $cp = intval($digits);
                break;
        }

        return unicode_to_utf8(array($cp));
    }
}
582df957b36SAndreas Gohr
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps named entities (e.g. '&eacute;') to UTF-8 encoded characters */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested in UTF-8 explicitly, so its
         * characters can be used as they are. Formerly the table was fetched
         * in PHP's default encoding and each character was converted with
         * ord(), which only looks at the first byte: with a multi-byte default
         * encoding every non-ASCII entity was decoded to the wrong character.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts the first byte of $c, taken as a code point, to UTF-8.
         * No longer used by this class; kept for backward compatibility.
         *
         * @param $c string
         * @return mixed
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Expects the match array of the named entity pattern used in
         * utf8_unhtml(): index 0 is the complete entity, index 1 is '#' for
         * numeric entities. Unknown named entities are returned unchanged.
         *
         * @param $ent array match array describing an entity
         * @return string
         */
        function decode($ent) {
            if ($ent[1] === '#') {
                return utf8_decode_numeric($ent);
            }

            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }

            return $ent[0];
        }
    }
}
626ea2eed85Sandi
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. This includes a multi-octet sequence that is cut
     * off by the end of the string.
     *
     * Without $strict the decoder is lenient: bytes that cannot start a
     * sequence and bytes that fail to continue one are skipped, and code
     * points from illegal sequences (overlong forms, surrogates, values
     * beyond 0x10FFFF) are still added to the output.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax is gone in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: merge its 6 payload bits at the right position
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }

        // the string ended while continuation octets were still expected
        if ($strict && $mState != 0) {
            trigger_error(
                    'utf8_to_unicode: Incomplete multi-octet '.
                    'sequence in UTF-8 at byte '.$len,
                    E_USER_WARNING
                );

            return false;
        }

        return $out;
    }
}
80082257610Sandi
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are silently left out of the result.
     *
     * The string is built by plain concatenation. Output buffering is not
     * used, so returning early on an error cannot leave a buffer open.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points;
     *               an empty string if $arr is not an array
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # outside the Unicode range (negative values used to slip
                # into the 2 byte branch and produced garbage)

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );

                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence

                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $result;
    }
}
89382257610Sandi
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is decoded with utf8_to_unicode() and every code point is
     * packed as big endian 16 bit units; code points above 0xFFFF are
     * written as a surrogate pair.
     *
     * @param string $str UTF-8 encoded string (taken by reference, not modified)
     * @param bool   $bom prepend the big endian byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0x10FFFF){
                // the lenient decoder may hand out values beyond Unicode;
                // they have no UTF-16 form, use the replacement character
                $cp = 0xFFFD;
            }

            if($cp > 0xFFFF){
                // astral plane: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
91115fa0b4fSAndreas Gohr
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate that is
     * followed by a low surrogate is combined into the code point it encodes,
     * so characters above 0xFFFF survive the conversion. All code points are
     * then encoded by unicode_to_utf8() in non-strict mode.
     *
     * @param string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return mixed UTF-8 encoded string as returned by unicode_to_utf8()
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $uni  = array();
        $high = null; // pending high surrogate waiting for its partner
        foreach($units as $unit){
            if($high !== null){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // unpaired high surrogate: pass it on unchanged
                $uni[] = $high;
                $high = null;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        if($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
92315fa0b4fSAndreas Gohr
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is walked once from left to right. Each step matches either
     * one well-formed UTF-8 character, which is copied, or one stray byte,
     * which is replaced. Matching continues at a byte offset instead of
     * cutting the string shorter on every step, so the work grows linearly
     * with the length of the input.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        removes them) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        // \G pins every match to the current offset
        $pattern = '/\G'.$UTF8_BAD.'/S';
        $len     = strlen($str);
        $pos     = 0;
        $result  = '';

        while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
            // group 2 only takes part when the "invalid byte" branch matched
            if ( !isset($matches[2])) {
                $result .= $matches[0];
            } else {
                $result .= $replace;
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
965ab77016bSAndreas Gohr
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside a UTF-8 string onto a character boundary
     *
     * An index at or below zero yields 0, an index at or beyond the end of
     * the string yields the string length. Any other index is shifted for as
     * long as it points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction in which the index is shifted,
     *                           false = backwards, to the start of the current character
     *                           true = forwards, to the start of the next character
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {
        if ($i <= 0) {
            return 0;
        }

        $length = strlen($str);
        if ($i >= $length) {
            return $length;
        }

        // walk in the requested direction until a non-continuation byte is hit
        $step = $next ? 1 : -1;
        while ($i > 0 && $i < $length && (ord($str[$i]) & 0xC0) === 0x80) {
            $i += $step;
        }

        return $i;
    }
}
9965953e889Schris
997ab77016bSAndreas Gohr// only needed if no mb_string available
998ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
99915fa0b4fSAndreas Gohr    /**
100082257610Sandi     * UTF-8 Case lookup table
100182257610Sandi     *
     * This lookup table maps lower case letters (keys) to their corresponding
     * upper case letters (values) in UTF-8
100482257610Sandi     *
100582257610Sandi     * @author Andreas Gohr <andi@splitbrain.org>
100682257610Sandi     */
100754662a04SAndreas Gohr    global $UTF8_LOWER_TO_UPPER;
1008df957b36SAndreas Gohr    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
100972de9068SAndreas Gohr            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
101072de9068SAndreas Gohr            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
101172de9068SAndreas Gohr            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
101285b77bbdSAndreas Gohr            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
101372de9068SAndreas Gohr            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
101485b77bbdSAndreas Gohr            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
101585b77bbdSAndreas Gohr            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
101685b77bbdSAndreas Gohr            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
101772de9068SAndreas Gohr            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
101872de9068SAndreas Gohr            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
101972de9068SAndreas Gohr            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
102072de9068SAndreas Gohr            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
102172de9068SAndreas Gohr            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
102272de9068SAndreas Gohr            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
102372de9068SAndreas Gohr            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
102472de9068SAndreas Gohr            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
102572de9068SAndreas Gohr            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
102672de9068SAndreas Gohr            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
102772de9068SAndreas Gohr            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
102872de9068SAndreas Gohr            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
102972de9068SAndreas Gohr            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
103072de9068SAndreas Gohr            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
103172de9068SAndreas Gohr            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
103272de9068SAndreas Gohr            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
103372de9068SAndreas Gohr            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
103472de9068SAndreas Gohr            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
103572de9068SAndreas Gohr            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
103672de9068SAndreas Gohr            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
103772de9068SAndreas Gohr            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
103872de9068SAndreas Gohr            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
103972de9068SAndreas Gohr            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
104072de9068SAndreas Gohr            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
104172de9068SAndreas Gohr            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
104272de9068SAndreas Gohr            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
104372de9068SAndreas Gohr            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
104472de9068SAndreas Gohr            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
104572de9068SAndreas Gohr            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
104672de9068SAndreas Gohr            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
104772de9068SAndreas Gohr            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
104872de9068SAndreas Gohr            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
104972de9068SAndreas Gohr            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
105072de9068SAndreas Gohr            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
105172de9068SAndreas Gohr            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
105272de9068SAndreas Gohr            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
105372de9068SAndreas Gohr            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
105472de9068SAndreas Gohr            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
105572de9068SAndreas Gohr            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
105672de9068SAndreas Gohr            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
105772de9068SAndreas Gohr            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
105872de9068SAndreas Gohr            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
105972de9068SAndreas Gohr            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
106072de9068SAndreas Gohr            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
106172de9068SAndreas Gohr            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
106272de9068SAndreas Gohr            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
106372de9068SAndreas Gohr            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
106472de9068SAndreas Gohr            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
106572de9068SAndreas Gohr            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
106672de9068SAndreas Gohr            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
106772de9068SAndreas Gohr            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
106872de9068SAndreas Gohr            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
106972de9068SAndreas Gohr            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
107072de9068SAndreas Gohr            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
107172de9068SAndreas Gohr            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
107272de9068SAndreas Gohr            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
107372de9068SAndreas Gohr            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
107472de9068SAndreas Gohr            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
107572de9068SAndreas Gohr            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
107682257610Sandi                );
107782257610Sandi
107882257610Sandi    /**
107982257610Sandi     * UTF-8 Case lookup table
108082257610Sandi     *
     * This lookup table maps upper case letters (keys) to their corresponding
     * lower case letters (values) in UTF-8
108382257610Sandi     *
108482257610Sandi     * @author Andreas Gohr <andi@splitbrain.org>
108582257610Sandi     */
108654662a04SAndreas Gohr    global $UTF8_UPPER_TO_LOWER;
1087df957b36SAndreas Gohr    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
108872de9068SAndreas Gohr            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
108972de9068SAndreas Gohr            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
109072de9068SAndreas Gohr            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
109185b77bbdSAndreas Gohr            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
109272de9068SAndreas Gohr            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
109385b77bbdSAndreas Gohr            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
109485b77bbdSAndreas Gohr            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
109585b77bbdSAndreas Gohr            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
109672de9068SAndreas Gohr            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
109772de9068SAndreas Gohr            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
109872de9068SAndreas Gohr            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
109972de9068SAndreas Gohr            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
110072de9068SAndreas Gohr            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
110172de9068SAndreas Gohr            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
110272de9068SAndreas Gohr            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
110372de9068SAndreas Gohr            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
110472de9068SAndreas Gohr            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
110572de9068SAndreas Gohr            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
110672de9068SAndreas Gohr            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
110772de9068SAndreas Gohr            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
110872de9068SAndreas Gohr            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
110972de9068SAndreas Gohr            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
111072de9068SAndreas Gohr            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
111172de9068SAndreas Gohr            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
111272de9068SAndreas Gohr            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
111372de9068SAndreas Gohr            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
111472de9068SAndreas Gohr            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
111572de9068SAndreas Gohr            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
111672de9068SAndreas Gohr            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
111772de9068SAndreas Gohr            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
111872de9068SAndreas Gohr            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
111972de9068SAndreas Gohr            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
112072de9068SAndreas Gohr            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
112172de9068SAndreas Gohr            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
112272de9068SAndreas Gohr            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
112372de9068SAndreas Gohr            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
112472de9068SAndreas Gohr            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
112572de9068SAndreas Gohr            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
112672de9068SAndreas Gohr            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
112772de9068SAndreas Gohr            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
112872de9068SAndreas Gohr            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
112972de9068SAndreas Gohr            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
113072de9068SAndreas Gohr            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
113172de9068SAndreas Gohr            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
113272de9068SAndreas Gohr            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
113372de9068SAndreas Gohr            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
113472de9068SAndreas Gohr            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
113572de9068SAndreas Gohr            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
113672de9068SAndreas Gohr            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
113772de9068SAndreas Gohr            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
113872de9068SAndreas Gohr            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
113972de9068SAndreas Gohr            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
114072de9068SAndreas Gohr            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
114172de9068SAndreas Gohr            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
114272de9068SAndreas Gohr            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
114372de9068SAndreas Gohr            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
114472de9068SAndreas Gohr            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
114572de9068SAndreas Gohr            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
114672de9068SAndreas Gohr            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
114772de9068SAndreas Gohr            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
114872de9068SAndreas Gohr            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
114972de9068SAndreas Gohr            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
115072de9068SAndreas Gohr            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
115172de9068SAndreas Gohr            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
115272de9068SAndreas Gohr            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
115372de9068SAndreas Gohr            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
115472de9068SAndreas Gohr            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
115572de9068SAndreas Gohr                );
115672de9068SAndreas Gohr}; // end of case lookup tables
1157ab77016bSAndreas Gohr
115882257610Sandi/**
115982257610Sandi * UTF-8 lookup table for lower case accented letters
116082257610Sandi *
116182257610Sandi * This lookup table maps accented lower case letters to replacements from the
116282257610Sandi * ASCII-7 range (some are digraphs, e.g. 'ß' => 'ss'). Lower case letters only.
116382257610Sandi * It is only assigned while the global is still empty; a non-empty value is kept.
116482257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
116582257610Sandi * @see    utf8_deaccent()
116682257610Sandi */
116754662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS;
1168df957b36SAndreas Gohrif(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
116982257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
117082257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
117182257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
117282257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
117382257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
117482257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
117582257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
117682257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
117782257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
117882257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
117982257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
118082257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
118182257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
118282257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
118374c0c504Schris  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
118482257610Sandi);
118582257610Sandi
118682257610Sandi/**
118782257610Sandi * UTF-8 lookup table for upper case accented letters
118882257610Sandi *
118982257610Sandi * This lookup table maps accented upper case letters to replacements from the
119082257610Sandi * ASCII-7 range (some are digraphs, e.g. 'Ä' => 'Ae'). Upper case letters only.
119182257610Sandi * It is only assigned while the global is still empty; a non-empty value is kept.
119282257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
119382257610Sandi * @see    utf8_deaccent()
119482257610Sandi */
119554662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS;
1196df957b36SAndreas Gohrif(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1197df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1198df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1199df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1200df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1201df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1202df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1203df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1204df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1205df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1206df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1207df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1208df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1209df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1210df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
121174c0c504Schris  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
121282257610Sandi);
121382257610Sandi
1214099ada41Sandi/**
1215099ada41Sandi * UTF-8 array of common special characters
1216099ada41Sandi *
1217099ada41Sandi * This array should contain all special characters (neither letter nor digit)
1218099ada41Sandi * defined in the various local charsets. It is not a complete list of the
1219099ada41Sandi * non-alphanumeric characters in UTF-8 and it is not perfect, but it should
1220099ada41Sandi * match most cases of special chars. The entries are integers, not strings.
1221099ada41Sandi *
1222099ada41Sandi * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1223ad81d431SAndreas Gohr * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 * (hence the gaps left in the first rows of the list).
 *
 * The array is only assigned while the global is still empty.
 *
 * NOTE(review): 0xff09 is listed twice, and the five values in the last row
 * (0xc2a0 ... 0xefbbbf) look like UTF-8 byte sequences, whereas the other
 * entries look like code points - confirm how the consumers interpret them.
1224099ada41Sandi *
1225099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
1226099ada41Sandi * @see    utf8_stripspecials()
1227099ada41Sandi */
122854662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS;
1229df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1230099ada41Sandi  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1231ad81d431SAndreas Gohr  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
12325c812709Sandi          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
12335c812709Sandi  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1234099ada41Sandi  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1235099ada41Sandi  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1236099ada41Sandi  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1237099ada41Sandi  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1238099ada41Sandi  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1239099ada41Sandi  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1240099ada41Sandi  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1241099ada41Sandi  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1242fae4b5fcSAndreas Gohr  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1243099ada41Sandi  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1244099ada41Sandi  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1245099ada41Sandi  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1246099ada41Sandi  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1247099ada41Sandi  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1248099ada41Sandi  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1249099ada41Sandi  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1250099ada41Sandi  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1251099ada41Sandi  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1252099ada41Sandi  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1253099ada41Sandi  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1254099ada41Sandi  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1255099ada41Sandi  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1256099ada41Sandi  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1257099ada41Sandi  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1258099ada41Sandi  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1259099ada41Sandi  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1260099ada41Sandi  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1261099ada41Sandi  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1262099ada41Sandi  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1263099ada41Sandi  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1264099ada41Sandi  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1265099ada41Sandi  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1266099ada41Sandi  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1267099ada41Sandi  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1268099ada41Sandi  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1269099ada41Sandi  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1270099ada41Sandi  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1271099ada41Sandi  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1272099ada41Sandi  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1273099ada41Sandi  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1274099ada41Sandi  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1275d5b23302STom N Harris  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1276d5b23302STom N Harris  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1277d5b23302STom N Harris  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1278d5b23302STom N Harris  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1279099ada41Sandi  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1280099ada41Sandi  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1281099ada41Sandi  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1282099ada41Sandi  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1283d5b23302STom N Harris          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1284d5b23302STom N Harris  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1285d5b23302STom N Harris  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1286d5b23302STom N Harris  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1287d5b23302STom N Harris  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1288d5b23302STom N Harris  0xffeb, 0xffec, 0xffed, 0xffee,
1289fae4b5fcSAndreas Gohr  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1290fae4b5fcSAndreas Gohr  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1291fae4b5fcSAndreas Gohr  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
12927de9cff5SAndreas Gohr  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
12937de9cff5SAndreas Gohr  0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf,
1294099ada41Sandi);
1295340756e4Sandi
1296720307d9Schris// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS data above
1297720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1298df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
129937242afaSTom N Harris    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
130032261ab5SChristopher Smith    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
130185b77bbdSAndreas Gohr    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1302720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
130385b77bbdSAndreas Gohr    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1304720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
130585b77bbdSAndreas Gohr    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1306720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1307720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1308720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1309720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1310720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1311720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1312720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1313d5b23302STom N Harris    '➷➸➹➺➻➼➽➾'.
1314d5b23302STom N Harris    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1315d5b23302STom N Harris    '�'.
1316d5b23302STom N Harris    '�ﹼﹽ'.
1317d5b23302STom N Harris    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1318fae4b5fcSAndreas Gohr    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
13197de9cff5SAndreas Gohr    '����������������������������������������������������������������'.
13207de9cff5SAndreas Gohr    '   ⁠';
1321720307d9Schris
13228a831f2bSAndreas Gohr/**
13238a831f2bSAndreas Gohr * Romanization lookup table
13248a831f2bSAndreas Gohr *
13258a831f2bSAndreas Gohr * This lookup table provides a way to transform strings written in a language
13268a831f2bSAndreas Gohr * that is not based upon Latin letters into plain ASCII.
13278a831f2bSAndreas Gohr *
13288a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
13298a831f2bSAndreas Gohr * one way, from non-Latin to ASCII, and it works by simple character replacement
13308a831f2bSAndreas Gohr * only. The peculiarities of each language are not supported.
13318a831f2bSAndreas Gohr *
13328a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
13338a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
13348a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
13358a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
13368a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
13378a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
13388a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
13398a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
1340014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com>
1341fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de>
134256c92de6SEivind Morland * @author Eivind Morland <eivind.morland@gmail.com>
13438a831f2bSAndreas Gohr */
134454662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
1345df957b36SAndreas Gohrif(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1346176ae32bSAndreas Gohr  // scandinavian - differs from what we do in deaccent
1347176ae32bSAndreas Gohr  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1348176ae32bSAndreas Gohr
13498a831f2bSAndreas Gohr  //russian cyrillic
13508a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
13518a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
13528a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
13538a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
13548a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
13558a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1356d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1357f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
13588a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
13598a831f2bSAndreas Gohr  // Ukrainian cyrillic
13608a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
13618a831f2bSAndreas Gohr  // Georgian
13628a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
13638a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
13648a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
13658a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
13668a831f2bSAndreas Gohr  'ჰ'=>'xh',
13678a831f2bSAndreas Gohr  //Sanskrit
13688a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
13698a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
13708a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
13718a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
13728a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
13738a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
13748a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
137556c92de6SEivind Morland  //Sanskrit diacritics
137656c92de6SEivind Morland  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
137756c92de6SEivind Morland  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
137856c92de6SEivind Morland  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
137956c92de6SEivind Morland  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
13808a831f2bSAndreas Gohr  //Hebrew
13813dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
13823dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
13833dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
13848a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
13858a831f2bSAndreas Gohr  //Arabic
13868a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
13878a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
13888a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
13898a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
13908a831f2bSAndreas Gohr
1391799e0977SAndreas Gohr  // Japanese characters  (last update: 2008-05-09)
13929476a253SAndreas Gohr
13938a831f2bSAndreas Gohr  // Japanese hiragana
1394fed467f8SDenis Scheither
1395fed467f8SDenis Scheither  // 3 character syllables, っ doubles the consonant after
1396fed467f8SDenis Scheither  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1397879205e1SAndreas Gohr  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1398799e0977SAndreas Gohr  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1399879205e1SAndreas Gohr  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1400879205e1SAndreas Gohr  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1401879205e1SAndreas Gohr  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1402879205e1SAndreas Gohr  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1403879205e1SAndreas Gohr  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1404879205e1SAndreas Gohr  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1405879205e1SAndreas Gohr  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1406879205e1SAndreas Gohr  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1407879205e1SAndreas Gohr
1408879205e1SAndreas Gohr  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1409879205e1SAndreas Gohr  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1410879205e1SAndreas Gohr  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1411fed467f8SDenis Scheither
1412fed467f8SDenis Scheither   // 2 character syllables - normal
1413879205e1SAndreas Gohr  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1414fed467f8SDenis Scheither  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1415fed467f8SDenis Scheither  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1416799e0977SAndreas Gohr  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1417799e0977SAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1418fed467f8SDenis Scheither  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1419fed467f8SDenis Scheither  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1420fed467f8SDenis Scheither  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1421fed467f8SDenis Scheither  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1422fed467f8SDenis Scheither  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1423fed467f8SDenis Scheither  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1424879205e1SAndreas Gohr  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1425879205e1SAndreas Gohr  'うぇ'=>'we','うぃ'=>'wi',
1426879205e1SAndreas Gohr  'いぇ'=>'ye',
1427fed467f8SDenis Scheither
1428fed467f8SDenis Scheither  // 2 character syllables, っ doubles the consonant after
1429fed467f8SDenis Scheither  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1430fed467f8SDenis Scheither  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1431fed467f8SDenis Scheither  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1432fed467f8SDenis Scheither  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1433fed467f8SDenis Scheither  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1434fed467f8SDenis Scheither  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1435fed467f8SDenis Scheither  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1436fed467f8SDenis Scheither  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1437fed467f8SDenis Scheither  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1438fed467f8SDenis Scheither  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1439799e0977SAndreas Gohr  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1440fed467f8SDenis Scheither
1441fed467f8SDenis Scheither  // 1 character syllables
1442fed467f8SDenis Scheither  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1443879205e1SAndreas Gohr  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1444fed467f8SDenis Scheither  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1445fed467f8SDenis Scheither  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
14469476a253SAndreas Gohr  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1447fed467f8SDenis Scheither  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1448fed467f8SDenis Scheither  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1449fed467f8SDenis Scheither  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1450fed467f8SDenis Scheither  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1451fed467f8SDenis Scheither  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1452fed467f8SDenis Scheither  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1453fed467f8SDenis Scheither  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1454879205e1SAndreas Gohr  'わ'=>'wa','を'=>'wo',
1455879205e1SAndreas Gohr  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1456879205e1SAndreas Gohr  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
14579476a253SAndreas Gohr  // old characters
14589476a253SAndreas Gohr  'ゑ'=>'we','ゐ'=>'wi',
1459fed467f8SDenis Scheither
14609476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
14619476a253SAndreas Gohr  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
14629476a253SAndreas Gohr  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1463fed467f8SDenis Scheither
14649476a253SAndreas Gohr  // never seen one of those (disabled for the moment)
1465879205e1SAndreas Gohr  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
14669476a253SAndreas Gohr  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
14679476a253SAndreas Gohr  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
14689476a253SAndreas Gohr  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
14699476a253SAndreas Gohr  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
14709476a253SAndreas Gohr  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
14719476a253SAndreas Gohr  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
14729476a253SAndreas Gohr  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
14739476a253SAndreas Gohr  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
14749476a253SAndreas Gohr  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
14759476a253SAndreas Gohr  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
14769476a253SAndreas Gohr  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
14779476a253SAndreas Gohr  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
14789476a253SAndreas Gohr  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1479fed467f8SDenis Scheither
1480fed467f8SDenis Scheither  // 'spare' characters from other romanization systems
1481fed467f8SDenis Scheither  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1482fed467f8SDenis Scheither  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1483fed467f8SDenis Scheither  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1484fed467f8SDenis Scheither  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1485fed467f8SDenis Scheither  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1486fed467f8SDenis Scheither  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1487fed467f8SDenis Scheither  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1488fed467f8SDenis Scheither  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1489fed467f8SDenis Scheither  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1490fed467f8SDenis Scheither  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1491fed467f8SDenis Scheither
1492fed467f8SDenis Scheither
14938a831f2bSAndreas Gohr  // Japanese katakana
1494fed467f8SDenis Scheither
1495fed467f8SDenis Scheither  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1496fed467f8SDenis Scheither  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1497fed467f8SDenis Scheither  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1498fed467f8SDenis Scheither  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1499fed467f8SDenis Scheither  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1500fed467f8SDenis Scheither  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1501fed467f8SDenis Scheither  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1502fed467f8SDenis Scheither  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1503fed467f8SDenis Scheither  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1504fed467f8SDenis Scheither  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1505799e0977SAndreas Gohr  'ッティー'=>'ttii',
1506799e0977SAndreas Gohr  'ッヂィー'=>'ddii',
1507fed467f8SDenis Scheither
1508fed467f8SDenis Scheither  // 3 character syllables - doubled vowels
1509fed467f8SDenis Scheither  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1510fed467f8SDenis Scheither  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1511fed467f8SDenis Scheither  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1512fed467f8SDenis Scheither  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1513fed467f8SDenis Scheither  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1514fed467f8SDenis Scheither  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1515fed467f8SDenis Scheither  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1516fed467f8SDenis Scheither  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1517fed467f8SDenis Scheither  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1518fed467f8SDenis Scheither  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1519fed467f8SDenis Scheither  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1520fed467f8SDenis Scheither  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1521fed467f8SDenis Scheither  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1522fed467f8SDenis Scheither  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1523fed467f8SDenis Scheither  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1524fed467f8SDenis Scheither  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1525fed467f8SDenis Scheither  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1526fed467f8SDenis Scheither  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1527fed467f8SDenis Scheither  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1528fed467f8SDenis Scheither  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1529fed467f8SDenis Scheither  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1530fed467f8SDenis Scheither  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1531fed467f8SDenis Scheither  'ウェー'=>'wee','ウィー'=>'wii',
1532fed467f8SDenis Scheither  'イェー'=>'yee',
1533799e0977SAndreas Gohr  'ティー'=>'tii',
1534799e0977SAndreas Gohr  'ヂィー'=>'dii',
1535fed467f8SDenis Scheither
1536fed467f8SDenis Scheither  // 3 character syllables - doubled consonants
1537fed467f8SDenis Scheither  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1538fed467f8SDenis Scheither  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1539fed467f8SDenis Scheither  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1540fed467f8SDenis Scheither  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1541fed467f8SDenis Scheither  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1542fed467f8SDenis Scheither  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1543fed467f8SDenis Scheither  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1544fed467f8SDenis Scheither  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1545fed467f8SDenis Scheither  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1546799e0977SAndreas Gohr  'ッティ'=>'tti',
1547799e0977SAndreas Gohr  'ッヂィ'=>'ddi',
1548fed467f8SDenis Scheither
1549fed467f8SDenis Scheither  // 3 character syllables - doubled vowel and consonants
1550fed467f8SDenis Scheither  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1551fed467f8SDenis Scheither  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1552fed467f8SDenis Scheither  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1553fed467f8SDenis Scheither  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1554fed467f8SDenis Scheither  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1555fed467f8SDenis Scheither  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1556fed467f8SDenis Scheither  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1557fed467f8SDenis Scheither  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1558799e0977SAndreas Gohr  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1559799e0977SAndreas Gohr  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1560fed467f8SDenis Scheither  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1561fed467f8SDenis Scheither
1562fed467f8SDenis Scheither  // 2 character syllables - normal
1563799e0977SAndreas Gohr  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1564799e0977SAndreas Gohr  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1565799e0977SAndreas Gohr  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1566fed467f8SDenis Scheither  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1567fed467f8SDenis Scheither  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1568fed467f8SDenis Scheither  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1569fed467f8SDenis Scheither  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1570fed467f8SDenis Scheither  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1571fed467f8SDenis Scheither  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1572fed467f8SDenis Scheither  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1573fed467f8SDenis Scheither  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1574879205e1SAndreas Gohr  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1575879205e1SAndreas Gohr  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1576fed467f8SDenis Scheither  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1577fed467f8SDenis Scheither  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1578fed467f8SDenis Scheither  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1579799e0977SAndreas Gohr  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1580fed467f8SDenis Scheither  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1581fed467f8SDenis Scheither  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1582fed467f8SDenis Scheither  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1583fed467f8SDenis Scheither  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1584fed467f8SDenis Scheither  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1585fed467f8SDenis Scheither  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1586fed467f8SDenis Scheither  'ウェ'=>'we','ウィ'=>'wi',
1587fed467f8SDenis Scheither  'イェ'=>'ye',
1588799e0977SAndreas Gohr  'ティ'=>'ti',
1589799e0977SAndreas Gohr  'ヂィ'=>'di',
1590fed467f8SDenis Scheither
1591fed467f8SDenis Scheither  // 2 character syllables - doubled vowel
1592fed467f8SDenis Scheither  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1593fed467f8SDenis Scheither  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1594fed467f8SDenis Scheither  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1595fed467f8SDenis Scheither  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1596fed467f8SDenis Scheither  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1597fed467f8SDenis Scheither  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1598fed467f8SDenis Scheither  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1599fed467f8SDenis Scheither  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1600fed467f8SDenis Scheither  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1601fed467f8SDenis Scheither  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1602fed467f8SDenis Scheither  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1603799e0977SAndreas Gohr  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1604fed467f8SDenis Scheither  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1605fed467f8SDenis Scheither  'ワー'=>'waa','ヲー'=>'woo',
1606fed467f8SDenis Scheither  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1607fed467f8SDenis Scheither  'ヵー'=>'kaa','ヶー'=>'kee',
16089476a253SAndreas Gohr  // old characters
16099476a253SAndreas Gohr  'ヱー'=>'wee','ヰー'=>'wii',
1610fed467f8SDenis Scheither
1611879205e1SAndreas Gohr  // separate katakana 'n'
1612879205e1SAndreas Gohr  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1613879205e1SAndreas Gohr  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1614879205e1SAndreas Gohr
1615fed467f8SDenis Scheither  // 2 character syllables - doubled consonants
1616fed467f8SDenis Scheither  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1617fed467f8SDenis Scheither  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1618fed467f8SDenis Scheither  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1619fed467f8SDenis Scheither  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1620fed467f8SDenis Scheither  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1621fed467f8SDenis Scheither  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1622fed467f8SDenis Scheither  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1623fed467f8SDenis Scheither  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1624799e0977SAndreas Gohr  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1625799e0977SAndreas Gohr  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1626fed467f8SDenis Scheither  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1627fed467f8SDenis Scheither
1628fed467f8SDenis Scheither  // 1 character syllables
1629fed467f8SDenis Scheither  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1630fed467f8SDenis Scheither  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1631fed467f8SDenis Scheither  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1632fed467f8SDenis Scheither  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1633fed467f8SDenis Scheither  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1634fed467f8SDenis Scheither  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1635fed467f8SDenis Scheither  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1636fed467f8SDenis Scheither  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1637fed467f8SDenis Scheither  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1638fed467f8SDenis Scheither  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1639879205e1SAndreas Gohr  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1640fed467f8SDenis Scheither  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1641fed467f8SDenis Scheither  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1642fed467f8SDenis Scheither  'ワ'=>'wa','ヲ'=>'wo',
1643fed467f8SDenis Scheither  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1644fed467f8SDenis Scheither  'ヵ'=>'ka','ヶ'=>'ke',
16459476a253SAndreas Gohr  // old characters
16469476a253SAndreas Gohr  'ヱ'=>'we','ヰ'=>'wi',
1647fed467f8SDenis Scheither
16489476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1649fed467f8SDenis Scheither  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1650fed467f8SDenis Scheither  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1651fed467f8SDenis Scheither
1652799e0977SAndreas Gohr  // special characters
1653799e0977SAndreas Gohr  '・'=>'_','、'=>'_',
1654799e0977SAndreas Gohr  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1655799e0977SAndreas Gohr
1656fed467f8SDenis Scheither  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1657fed467f8SDenis Scheither  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1658fed467f8SDenis Scheither  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1659fed467f8SDenis Scheither  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1660fed467f8SDenis Scheither  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1661fed467f8SDenis Scheither  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1662fed467f8SDenis Scheither  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1663fed467f8SDenis Scheither  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1664fed467f8SDenis Scheither  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1665fed467f8SDenis Scheither  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1666fed467f8SDenis Scheither  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1667fed467f8SDenis Scheither  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
16688a831f2bSAndreas Gohr
16698a831f2bSAndreas Gohr  // "Greeklish"
16708a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
16718a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
16728a831f2bSAndreas Gohr
16738a831f2bSAndreas Gohr  // Thai
16748a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
16758a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
16768a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
16778a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
16788a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
16798a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1680014d0ab6SAndreas Gohr  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1681014d0ab6SAndreas Gohr  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1682014d0ab6SAndreas Gohr  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1683014d0ab6SAndreas Gohr  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1684014d0ab6SAndreas Gohr  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1685014d0ab6SAndreas Gohr  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1686014d0ab6SAndreas Gohr  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1687014d0ab6SAndreas Gohr  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1688014d0ab6SAndreas Gohr  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1689014d0ab6SAndreas Gohr  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1690014d0ab6SAndreas Gohr  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1691014d0ab6SAndreas Gohr  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
16928a831f2bSAndreas Gohr
16938a831f2bSAndreas Gohr  // Korean
16948a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
16958a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
16968a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
16978a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
16988a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
16998a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
17008a831f2bSAndreas Gohr);
1701340756e4Sandi
17028a831f2bSAndreas Gohr
1703