xref: /dokuwiki/inc/utf8.php (revision f393a4eb51a5f8ed0e64f09f76cbafe57d7dcb57)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
51f2058faSAndreas Gohr * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide once whether the mbstring extension should back the helpers below.
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() is available and the opt-out
 * constant UTF8_NOMBSTRING has not been defined, 0 otherwise. A value
 * defined before this point is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

// mbstring functions called without an explicit encoding should assume UTF-8
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
215e613a5cSchris
if(!function_exists('utf8_isASCII')){
    /**
     * Tells whether a string is made of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str the string to inspect
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        // the pattern looks for a single byte above 0x7F
        $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str);
        return !($foundHighByte === 1);
    }
}
3244f669e9Sandi
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or
     * above is dropped, all other bytes are kept in their original order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to strip
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        // byte-wise on purpose (no 'u' modifier): the input does not need to
        // be valid UTF-8. This also avoids the $str{$i} offset syntax, which
        // is a parse error from PHP 8.0 on.
        return (string) preg_replace('/[^\x00-\x7F]/', '', $str);
    }
}
52e1906e6eSandi
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is examined: every lead byte has to be followed
     * by the number of 10bbbbbb continuation bytes it announces. Lead bytes
     * announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the bytes to examine
     * @return bool   true if the whole string follows the UTF-8 byte layout
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {                       // 0bbbbbbb
                $pos++;
                continue;
            }

            // how many continuation bytes does the lead byte announce?
            if     (($lead & 0xE0) == 0xC0) $trail = 1; // 110bbbbb
            elseif (($lead & 0xF0) == 0xE0) $trail = 2; // 1110bbbb
            elseif (($lead & 0xF8) == 0xF0) $trail = 3; // 11110bbb
            elseif (($lead & 0xFC) == 0xF8) $trail = 4; // 111110bb
            elseif (($lead & 0xFE) == 0xFC) $trail = 5; // 1111110b
            else return false;                          // matches no model

            // each announced byte must exist and look like 10bbbbbb
            for ($seen = 0; $seen < $trail; $seen++) {
                $pos++;
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
            }

            $pos++;
        }

        return true;
    }
}
8049c713a3Sandi
if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators. Trailing
     * separators are ignored, so '/a/b/' yields 'b' like basename() does.
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = rtrim($path, '/\\');

        // strrpos() returns false when there is no separator; map that to -1
        // so that "+1" below points at the start of the string instead of
        // silently skipping the first character
        $slash     = strrpos($path, '/');
        $backslash = strrpos($path, '\\');
        $rpos = max(
            ($slash === false) ? -1 : $slash,
            ($backslash === false) ? -1 : $backslash
        );
        $file = (string) substr($path, $rpos+1);

        $suflen = strlen($suffix);
        // strict comparison: '==' would compare numeric looking strings as numbers
        if($suflen && (substr($file, -$suflen) === (string) $suffix)){
            $file = (string) substr($file, 0, -$suflen);
        }

        return $file;
    }
}
105*f393a4ebSAndreas Gohr
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Returns the number of characters (not bytes) of a UTF-8 string.
     *
     * Uses mb_strlen() when mbstring support is enabled through
     * UTF8_MBSTRING. Otherwise the bytes that are not continuation bytes
     * (10bbbbbb) are counted; a well-formed UTF-8 string has exactly one
     * such byte per character. This replaces the former utf8_decode()
     * based count because utf8_decode() is deprecated as of PHP 8.2.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        if(defined('UTF8_MBSTRING') && UTF8_MBSTRING){
            return mb_strlen($string, 'UTF-8');
        }

        // byte-wise match (no 'u' modifier) of everything but 0x80-0xBF
        return (int) preg_match_all('/[^\x80-\xBF]/', (string) $string, $ignored);
    }
}
1222f954959Sandi
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * With mbstring enabled this simply delegates to mb_substr(). Without it
     * a PCRE pattern with the 'u' modifier is assembled: a non-captured group
     * skipping $offset characters followed by a captured group holding the
     * wanted part. As a repetition count has to stay below 65536, larger
     * counts are expressed as repeated groups of 65535 characters.
     *
     * Like mb_substr() an empty string (never false) is returned when
     * nothing can be extracted.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        // normalise the argument types once to avoid repeated notices/warnings
        $str       = (string)$str;
        $offset    = (int)$offset;
        $hasLength = !is_null($length);
        if ($hasLength) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // counting characters is comparatively expensive, so it only happens
        // when needed (not for a positive offset without a length)
        $chars = null;

        // turn an offset counted from the end into one counted from the start
        // (a tail anchored pattern would work too but is horribly slow)
        if ($offset < 0) {
            $chars  = strlen(utf8_decode($str));
            $offset = max(0, $chars + $offset);
        }

        // non-captured group swallowing the first $offset characters
        $skip = '^';
        if ($offset > 0) {
            $blocks = (int)($offset/65535);
            $rest   = $offset%65535;
            $bulk   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
            $skip   = '^(?:'.$bulk.'.{'.$rest.'})';
        }

        // captured group holding the result
        if (!$hasLength) {
            $take = '(.*)$';                           // everything up to the end
        } else {
            if ($chars === null) $chars = strlen(utf8_decode($str));
            if ($offset > $chars) return '';           // starts behind the end

            if ($length > 0) {
                // never ask for more characters than are left
                $length = min($chars-$offset, $length);

                $blocks = (int)($length/65535);
                $rest   = $length%65535;
                $bulk   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $take   = '('.$bulk.'.{'.$rest.'})';
            } else {
                // negative length: leave out that many characters at the end
                if ($length < ($offset - $chars)) return '';

                $blocks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;
                $bulk   = $blocks ? '(?:.{65535}){'.$blocks.'}' : '';
                $take   = '(.*)(?:'.$bulk.'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$skip.$take.'#us',$str,$match)) return '';
        return $match[1];
    }
}
22610f09f2aSAndreas Gohr
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is put together from the first $start characters of the
     * string (only when $start is positive), the replacement and whatever
     * follows character position $start+$length.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      UTF-8 encoded string
     * @param  string $replacement text to put in
     * @param  int    $start       character position where the replacement starts
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $tail = utf8_substr($string, $start+$length);

        return $head.$replacement.$tail;
    }
}
242dc57ef04Sandi
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. Otherwise every
     * character of the charlist is stripped from the start of the string;
     * the charlist is taken literally (no ranges).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class. preg_quote() also
        // covers '^', which would otherwise negate the class when it is the
        // first character of the charlist
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
262f29317c1Sandi
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. Otherwise every
     * character of the charlist is stripped from the end of the string;
     * the charlist is taken literally (no ranges).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class. preg_quote() also
        // covers '^', which would otherwise negate the class when it is the
        // first character of the charlist
        $charlist = preg_quote($charlist, '/');

        // 'D' makes '$' match at the very end only; without it '$' also
        // matches in front of a final newline and characters before that
        // newline would be stripped
        return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
282f29317c1Sandi
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Falls back to the native trim() when no charlist is given, otherwise
     * the string is trimmed on the right and then on the left with the
     * UTF-8 aware helpers.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $rightTrimmed = utf8_rtrim($str,$charlist);
            return utf8_ltrim($rightTrimmed,$charlist);
        }

        return trim($str);
    }
}
299f29317c1Sandi
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * translated with the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
31782257610Sandi
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * translated with the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
33582257610Sandi
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if ($chars == 0) return '';
        if ($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything behind it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];

        return utf8_strtoupper($first).$rest;
    }
}
35726ece5a7SAndreas Gohr
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param string $str UTF-8 encoded string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word:
     *                        [0] whitespace plus word, [2] the leading
     *                        whitespace, [3] the first character of the word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];

        // Cut the whitespace off by its captured length. ltrim() must not be
        // used here: its default character set differs from the pattern's
        // whitespace class (it lacks the form feed and includes NUL), which
        // left a leading form feed in place and corrupted the word.
        $word = substr($matches[0], strlen($leadingws));

        // the word starts with $matches[3], so the remainder begins right
        // after the bytes of that first character
        $rest = substr($word, strlen($matches[3]));

        return $leadingws . utf8_strtoupper($matches[3]) . $rest;
    }
}
39426ece5a7SAndreas Gohr
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS; the lower case table is applied first.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $wantLower = ($case <= 0);
        $wantUpper = ($case >= 0);

        if($wantLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($wantUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
41682257610Sandi
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; //nothing to do
    }
}
4308a831f2bSAndreas Gohr
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
     * string. In addition the bytes 0x00 to 0x19 and whatever is passed in
     * $additional are stripped as well.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted character list is built on the first call only
        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class unquoted
        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';

        return preg_replace($pattern,$repl,$string);
    }
}
456099ada41Sandi
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done byte-wise with strpos(); the byte position
     * of a hit is converted to a character position by counting the
     * characters in front of it. A hit in front of the requested character
     * offset moves the byte-wise search start forward and the search is
     * repeated.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack UTF-8 encoded string to search in
     * @param  string  $needle   string to search for
     * @param  integer $offset   character position to start searching from
     * @return integer character position of the hit, false if there is none
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // extra bytes to add to the search start
        $shift = 0;

        for(;;){
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) return false;

            // number of characters in front of the hit
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if (!($charPos < $offset)) return $charPos;

            // hit lies in front of the wanted offset, search again further on
            $shift = $bytePos - $charPos;
        }
    }
}
487f29317c1Sandi
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Codepoints below 0x80 are passed through as plain characters,
     * codepoints below 0x100 become decimal entities and all others
     * hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $html = '';

        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $html .= '&#x'.dechex($codepoint).';';
            } elseif ($codepoint >= 0x80) {
                $html .= '&#'.$codepoint.';';
            } else {
                $html .= chr($codepoint);
            }
        }

        return $html;
    }
}
5099f9fb0e5STom N Harris
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities === null) {
            // numeric entities only
            $pattern  = '/(&#([Xx])?([0-9A-Za-z]+);)/m';
            $callback = 'utf8_decode_numeric';
        } else {
            // numeric and named entities
            $pattern  = '/&(#)?([Xx])?([0-9A-Za-z]+);/m';
            $callback = array(&$decoder, 'decode');
        }

        return preg_replace_callback($pattern, $callback, $str);
    }
}
541df957b36SAndreas Gohr
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Meant to be used as preg_replace_callback() callback: index 2 of the
     * match holds the hex marker ('X' or 'x') if there is one, index 3 the
     * digits of the entity.
     *
     * @param  array $ent match of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');

        if ($isHex) {
            $codepoint = hexdec($ent[3]);
        } else {
            $codepoint = intval($ent[3]);
        }

        return unicode_to_utf8(array($codepoint));
    }
}
562df957b36SAndreas Gohr
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps named entities (e.g. '&auml;') to UTF-8 characters */
        public $table;

        /**
         * Initializes the decoding tables
         */
        public function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES);
            $table = array_flip($table);
            $this->table = array_map(array($this,'makeutf8'), $table);
        }

        /**
         * Makes sure a character of the translation table is UTF-8
         *
         * Since PHP 5.4 get_html_translation_table() hands out UTF-8
         * characters by default. Those (and plain ASCII) are returned as
         * they are; taking ord() of a multi byte character would only see
         * its first byte and produce a wrong character. A single high byte
         * is treated as a codepoint and converted with unicode_to_utf8().
         *
         * @param  string $c a character from the translation table
         * @return mixed
         */
        public function makeutf8($c) {
            if (strlen($c) !== 1 || ord($c) < 0x80) {
                return $c;
            }
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Meant to be used as preg_replace_callback() callback. Numeric
         * entities (index 1 of the match is '#') are handed over to
         * utf8_decode_numeric(), named ones are looked up in the table.
         * Unknown entities are returned unchanged.
         *
         * @param  array $ent match of an entity
         * @return string
         */
        public function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
606ea2eed85Sandi
if(!function_exists('utf8_to_unicode')){
    /**
     * Decode an UTF-8 byte string into an array of Unicode code points
     *
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are left out
     * of the output.
     *
     * If $strict is set to true the function raises a PHP error at level
     * E_USER_WARNING and returns false as soon as it meets an illegal lead
     * byte, a broken or truncated multi-octet sequence, a non-shortest
     * form, a surrogate or a code point above 0x10FFFF.
     *
     * If $strict is false nothing is raised: bytes that do not fit are
     * skipped, an unfinished sequence at the end of the string is dropped
     * and code points decoded from illegal sequences are still added to the
     * result.
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // number of continuation octets still expected for
                         // the character that is currently being decoded
        $mUcs4  = 0;     // code point assembled so far
        $mBytes = 1;     // total number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets on purpose: the curly brace string offset
            // syntax is deprecated in PHP 7.4 and a parse error in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or
                // the first octet of a multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }
                // not strict: the stray octet is skipped

            } else {

                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: merge its 6 payload bits
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }
                            // not strict: the illegal value is output below

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
                // not strict: the octet is skipped, the decoder keeps waiting
            }
        }

        // the input ended while continuation octets were still expected
        if ($strict && $mState != 0) {
            trigger_error(
                    'utf8_to_unicode: Truncated multi-octet '.
                        'sequence in UTF-8 at byte '.$len,
                    E_USER_WARNING
                );

            return false;
        }

        return $out;
    }
}
78082257610Sandi
if(!function_exists('unicode_to_utf8')){
    /**
     * Encode an array of Unicode code points as an UTF-8 string
     *
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are left out of
     * the result. Surrogates are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. If $strict is false such values are silently
     * left out of the result.
     *
     * The string is built by plain concatenation. No output buffer is
     * opened, so none can be left behind when the function bails out early.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     *               ('' is returned when $arr is not an array)
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # outside of the Unicode range

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }
                // not strict: skip the value

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }
                // not strict: skip the value

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff)

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
87382257610Sandi
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is decoded with utf8_to_unicode() and every code point is
     * packed as a big endian 16 bit unit; code points from 0x10000 up to
     * 0x10FFFF are written as a surrogate pair instead of being cut down
     * to their lower 16 bits.
     *
     * @param  string  $str UTF-8 encoded string (taken by reference, not modified here)
     * @param  boolean $bom prepend the big endian byte order mark?
     * @return string  UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF && $cp <= 0x10FFFF){
                // supplementary plane: high surrogate, then low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
89115fa0b4fSAndreas Gohr
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is split into big endian 16 bit units. A high surrogate
     * that is directly followed by a low surrogate is combined into the
     * code point it encodes; every other unit is passed on as it is. The
     * resulting list is encoded by unicode_to_utf8() in its non-strict mode.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified here)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if (!is_array($units)) return unicode_to_utf8($units);

        // unpack() numbers its result from 1, reindex for simple look-ahead
        $units = array_values($units);
        $count = count($units);

        $uni = array();
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];

            if ($unit >= 0xD800 && $unit <= 0xDBFF && ($i + 1) < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF) {
                // surrogate pair: 10 bits from each half, offset by 0x10000
                $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed as well
            } else {
                $uni[] = $unit;
            }
        }

        return unicode_to_utf8($uni);
    }
}
90315fa0b4fSAndreas Gohr
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * The string is walked from left to right. Every well formed UTF-8
     * sequence is copied to the result, every byte that is not part of one
     * is swapped for $replace. With the default $replace of '' the bad
     * bytes are simply removed.
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        drops them) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte
        $pattern = '/'.$UTF8_BAD.'/S';

        $result = '';
        $length = strlen($str);
        $pos    = 0;

        // Match at a moving offset instead of cutting the head off the string
        // after every match. Each pass consumes at least one byte.
        while ($pos < $length && preg_match($pattern, $str, $matches, 0, $pos)) {
            if (isset($matches[2]) && $matches[2] !== '') {
                // only the "invalid byte" alternative fills group 2
                $result .= $replace;
            } else {
                $result .= $matches[0];
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
945ab77016bSAndreas Gohr
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside an UTF-8 string onto a character boundary
     *
     * Indexes at or below 0 give 0, indexes at or beyond the byte length
     * give the byte length. Anything in between is moved for as long as it
     * points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to move in,
     *                         false = backwards (start of the current character)
     *                         true  = forwards (start of the next character)
     *
     * @return int          byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        // clamp anything outside of the string
        if ($i <= 0) return 0;

        $bytes = strlen($str);
        if ($i >= $bytes) return $bytes;

        // walk one byte at a time in the requested direction
        $step = $next ? 1 : -1;
        while ($i > 0 && $i < $bytes && (ord($str[$i]) & 0xC0) === 0x80) {
            $i += $step;
        }

        return $i;
    }
}
9765953e889Schris
977ab77016bSAndreas Gohr// only needed if no mb_string available
978ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
97915fa0b4fSAndreas Gohr    /**
98082257610Sandi     * UTF-8 Case lookup table
98182257610Sandi     *
     * This lookup table maps lower case letters to their corresponding
     * upper case letters in UTF-8
98482257610Sandi     *
98582257610Sandi     * @author Andreas Gohr <andi@splitbrain.org>
98682257610Sandi     */
98754662a04SAndreas Gohr    global $UTF8_LOWER_TO_UPPER;
988df957b36SAndreas Gohr    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
98972de9068SAndreas Gohr            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
99072de9068SAndreas Gohr            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
99172de9068SAndreas Gohr            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
99285b77bbdSAndreas Gohr            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
99372de9068SAndreas Gohr            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
99485b77bbdSAndreas Gohr            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
99585b77bbdSAndreas Gohr            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
99685b77bbdSAndreas Gohr            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
99772de9068SAndreas Gohr            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
99872de9068SAndreas Gohr            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
99972de9068SAndreas Gohr            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
100072de9068SAndreas Gohr            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
100172de9068SAndreas Gohr            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
100272de9068SAndreas Gohr            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
100372de9068SAndreas Gohr            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
100472de9068SAndreas Gohr            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
100572de9068SAndreas Gohr            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
100672de9068SAndreas Gohr            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
100772de9068SAndreas Gohr            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
100872de9068SAndreas Gohr            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
100972de9068SAndreas Gohr            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
101072de9068SAndreas Gohr            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
101172de9068SAndreas Gohr            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
101272de9068SAndreas Gohr            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
101372de9068SAndreas Gohr            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
101472de9068SAndreas Gohr            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
101572de9068SAndreas Gohr            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
101672de9068SAndreas Gohr            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
101772de9068SAndreas Gohr            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
101872de9068SAndreas Gohr            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
101972de9068SAndreas Gohr            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
102072de9068SAndreas Gohr            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
102172de9068SAndreas Gohr            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
102272de9068SAndreas Gohr            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
102372de9068SAndreas Gohr            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
102472de9068SAndreas Gohr            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
102572de9068SAndreas Gohr            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
102672de9068SAndreas Gohr            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
102772de9068SAndreas Gohr            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
102872de9068SAndreas Gohr            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
102972de9068SAndreas Gohr            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
103072de9068SAndreas Gohr            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
103172de9068SAndreas Gohr            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
103272de9068SAndreas Gohr            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
103372de9068SAndreas Gohr            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
103472de9068SAndreas Gohr            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
103572de9068SAndreas Gohr            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
103672de9068SAndreas Gohr            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
103772de9068SAndreas Gohr            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
103872de9068SAndreas Gohr            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
103972de9068SAndreas Gohr            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
104072de9068SAndreas Gohr            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
104172de9068SAndreas Gohr            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
104272de9068SAndreas Gohr            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
104372de9068SAndreas Gohr            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
104472de9068SAndreas Gohr            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
104572de9068SAndreas Gohr            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
104672de9068SAndreas Gohr            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
104772de9068SAndreas Gohr            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
104872de9068SAndreas Gohr            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
104972de9068SAndreas Gohr            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
105072de9068SAndreas Gohr            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
105172de9068SAndreas Gohr            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
105272de9068SAndreas Gohr            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
105372de9068SAndreas Gohr            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
105472de9068SAndreas Gohr            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
105572de9068SAndreas Gohr            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
105682257610Sandi                );
105782257610Sandi
105882257610Sandi    /**
105982257610Sandi     * UTF-8 Case lookup table
106082257610Sandi     *
     * This lookup table maps upper case letters to their corresponding
     * lower case letters in UTF-8
106382257610Sandi     *
106482257610Sandi     * @author Andreas Gohr <andi@splitbrain.org>
106582257610Sandi     */
106654662a04SAndreas Gohr    global $UTF8_UPPER_TO_LOWER;
1067df957b36SAndreas Gohr    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
106872de9068SAndreas Gohr            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
106972de9068SAndreas Gohr            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
107072de9068SAndreas Gohr            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
107185b77bbdSAndreas Gohr            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
107272de9068SAndreas Gohr            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
107385b77bbdSAndreas Gohr            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
107485b77bbdSAndreas Gohr            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
107585b77bbdSAndreas Gohr            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
107672de9068SAndreas Gohr            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
107772de9068SAndreas Gohr            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
107872de9068SAndreas Gohr            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
107972de9068SAndreas Gohr            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
108072de9068SAndreas Gohr            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
108172de9068SAndreas Gohr            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
108272de9068SAndreas Gohr            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
108372de9068SAndreas Gohr            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
108472de9068SAndreas Gohr            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
108572de9068SAndreas Gohr            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
108672de9068SAndreas Gohr            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
108772de9068SAndreas Gohr            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
108872de9068SAndreas Gohr            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
108972de9068SAndreas Gohr            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
109072de9068SAndreas Gohr            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
109172de9068SAndreas Gohr            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
109272de9068SAndreas Gohr            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
109372de9068SAndreas Gohr            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
109472de9068SAndreas Gohr            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
109572de9068SAndreas Gohr            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
109672de9068SAndreas Gohr            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
109772de9068SAndreas Gohr            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
109872de9068SAndreas Gohr            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
109972de9068SAndreas Gohr            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
110072de9068SAndreas Gohr            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
110172de9068SAndreas Gohr            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
110272de9068SAndreas Gohr            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
110372de9068SAndreas Gohr            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
110472de9068SAndreas Gohr            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
110572de9068SAndreas Gohr            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
110672de9068SAndreas Gohr            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
110772de9068SAndreas Gohr            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
110872de9068SAndreas Gohr            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
110972de9068SAndreas Gohr            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
111072de9068SAndreas Gohr            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
111172de9068SAndreas Gohr            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
111272de9068SAndreas Gohr            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
111372de9068SAndreas Gohr            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
111472de9068SAndreas Gohr            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
111572de9068SAndreas Gohr            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
111672de9068SAndreas Gohr            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
111772de9068SAndreas Gohr            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
111872de9068SAndreas Gohr            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
111972de9068SAndreas Gohr            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
112072de9068SAndreas Gohr            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
112172de9068SAndreas Gohr            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
112272de9068SAndreas Gohr            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
112372de9068SAndreas Gohr            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
112472de9068SAndreas Gohr            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
112572de9068SAndreas Gohr            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
112672de9068SAndreas Gohr            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
112772de9068SAndreas Gohr            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
112872de9068SAndreas Gohr            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
112972de9068SAndreas Gohr            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
113072de9068SAndreas Gohr            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
113172de9068SAndreas Gohr            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
113272de9068SAndreas Gohr            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
113372de9068SAndreas Gohr            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
113472de9068SAndreas Gohr            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
113572de9068SAndreas Gohr                );
113672de9068SAndreas Gohr}; // end of case lookup tables
1137ab77016bSAndreas Gohr
113882257610Sandi/**
113982257610Sandi * UTF-8 lookup table for lower case accented letters
114082257610Sandi *
114182257610Sandi * This lookup table maps lower case accented (and a few other non-ASCII)
114282257610Sandi * letters to replacements from the ASCII-7 range. Keys are lower case only.
 * Most keys map to a single base letter; some expand to two letters
 * (ä => ae, ö => oe, ü => ue, ß => ss, þ => th, ð => dh, æ => ae).
 * The table is only assigned when $UTF8_LOWER_ACCENTS is still empty.
114382257610Sandi *
114482257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
114582257610Sandi * @see    utf8_deaccent()
114682257610Sandi */
114754662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS;
1148df957b36SAndreas Gohrif(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
114982257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
115082257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
115182257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
115282257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
115382257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
115482257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
115582257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
115682257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
115782257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
115882257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
115982257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
116082257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
116182257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
116282257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
116374c0c504Schris  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
116482257610Sandi);
116582257610Sandi
116682257610Sandi/**
116782257610Sandi * UTF-8 lookup table for upper case accented letters
116882257610Sandi *
116982257610Sandi * This lookup table maps upper case accented (and a few other non-ASCII)
117082257610Sandi * letters to replacements from the ASCII-7 range. Keys are upper case only.
 * Most keys map to a single base letter; some expand to a capital followed
 * by a lower case letter (Ä => Ae, Ö => Oe, Ü => Ue, Þ => Th, Ð => Dh, Æ => Ae).
 * The table is only assigned when $UTF8_UPPER_ACCENTS is still empty.
117182257610Sandi *
117282257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
117382257610Sandi * @see    utf8_deaccent()
117482257610Sandi */
117554662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS;
1176df957b36SAndreas Gohrif(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1177df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1178df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1179df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1180df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1181df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1182df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1183df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1184df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1185df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1186df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1187df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1188df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1189df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1190df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
119174c0c504Schris  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
119282257610Sandi);
119382257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point (for example 0x0020 for the space), not
 * a UTF-8 byte sequence, so every value must be <= 0x10ffff.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * The array is only assigned when $UTF8_SPECIAL_CHARS is still empty.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  // 0xff09 used to be repeated at the start of this row; it is listed once now
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // U+2007 figure space, U+202F narrow no-break space, U+2060 word joiner,
  // U+FEFF zero width no-break space (BOM). This row formerly held the UTF-8
  // byte sequences of these characters (0xe28087, 0xe280af, 0xe281a0, 0xefbbbf),
  // which are not valid code points, plus 0xc2a0 for U+00A0, which is already
  // listed above as 0x00a0.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1275340756e4Sandi
1276720307d9Schris// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS code point list above
1277720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1278df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
127937242afaSTom N Harris    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
128032261ab5SChristopher Smith    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
128185b77bbdSAndreas Gohr    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1282720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
128385b77bbdSAndreas Gohr    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1284720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
128585b77bbdSAndreas Gohr    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1286720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1287720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1288720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1289720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1290720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1291720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1292720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1293d5b23302STom N Harris    '➷➸➹➺➻➼➽➾'.
1294d5b23302STom N Harris    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1295d5b23302STom N Harris    '�'.
1296d5b23302STom N Harris    '�ﹼﹽ'.
1297d5b23302STom N Harris    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1298fae4b5fcSAndreas Gohr    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
12997de9cff5SAndreas Gohr    '����������������������������������������������������������������'.
13007de9cff5SAndreas Gohr    '   ⁠';
1301720307d9Schris
13028a831f2bSAndreas Gohr/**
13038a831f2bSAndreas Gohr * Romanization lookup table
13048a831f2bSAndreas Gohr *
13058a831f2bSAndreas Gohr * This lookup table provides a way to transform strings written in a language
13068a831f2bSAndreas Gohr * different from the ones based upon Latin letters into plain ASCII.
13078a831f2bSAndreas Gohr *
13088a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
13098a831f2bSAndreas Gohr * one way, from non-Latin to ASCII, and it works by simple character replacement
13108a831f2bSAndreas Gohr * only. Peculiarities of individual languages are not supported.
13118a831f2bSAndreas Gohr *
13128a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
13138a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
13148a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
13158a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
13168a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
13178a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
13188a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
13198a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
1320014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com>
1321fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de>
132256c92de6SEivind Morland * @author Eivind Morland <eivind.morland@gmail.com>
13238a831f2bSAndreas Gohr */
132454662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
1325df957b36SAndreas Gohrif(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1326176ae32bSAndreas Gohr  // scandinavian - differs from what we do in deaccent
1327176ae32bSAndreas Gohr  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1328176ae32bSAndreas Gohr
13298a831f2bSAndreas Gohr  //russian cyrillic
13308a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
13318a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
13328a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
13338a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
13348a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
13358a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1336d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1337f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
13388a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
13398a831f2bSAndreas Gohr  // Ukrainian cyrillic
13408a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
13418a831f2bSAndreas Gohr  // Georgian
13428a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
13438a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
13448a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
13458a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
13468a831f2bSAndreas Gohr  'ჰ'=>'xh',
13478a831f2bSAndreas Gohr  //Sanskrit
13488a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
13498a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
13508a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
13518a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
13528a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
13538a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
13548a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
135556c92de6SEivind Morland  //Sanskrit diacritics
135656c92de6SEivind Morland  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
135756c92de6SEivind Morland  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
135856c92de6SEivind Morland  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
135956c92de6SEivind Morland  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
13608a831f2bSAndreas Gohr  //Hebrew
13613dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
13623dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
13633dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
13648a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
13658a831f2bSAndreas Gohr  //Arabic
13668a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
13678a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
13688a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
13698a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
13708a831f2bSAndreas Gohr
1371799e0977SAndreas Gohr  // Japanese characters  (last update: 2008-05-09)
13729476a253SAndreas Gohr
13738a831f2bSAndreas Gohr  // Japanese hiragana
1374fed467f8SDenis Scheither
1375fed467f8SDenis Scheither  // 3 character syllables, っ doubles the consonant after
1376fed467f8SDenis Scheither  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1377879205e1SAndreas Gohr  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1378799e0977SAndreas Gohr  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1379879205e1SAndreas Gohr  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1380879205e1SAndreas Gohr  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1381879205e1SAndreas Gohr  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1382879205e1SAndreas Gohr  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1383879205e1SAndreas Gohr  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1384879205e1SAndreas Gohr  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1385879205e1SAndreas Gohr  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1386879205e1SAndreas Gohr  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1387879205e1SAndreas Gohr
1388879205e1SAndreas Gohr  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1389879205e1SAndreas Gohr  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1390879205e1SAndreas Gohr  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1391fed467f8SDenis Scheither
1392fed467f8SDenis Scheither   // 2 character syllables - normal
1393879205e1SAndreas Gohr  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1394fed467f8SDenis Scheither  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1395fed467f8SDenis Scheither  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1396799e0977SAndreas Gohr  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1397799e0977SAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1398fed467f8SDenis Scheither  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1399fed467f8SDenis Scheither  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1400fed467f8SDenis Scheither  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1401fed467f8SDenis Scheither  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1402fed467f8SDenis Scheither  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1403fed467f8SDenis Scheither  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1404879205e1SAndreas Gohr  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1405879205e1SAndreas Gohr  'うぇ'=>'we','うぃ'=>'wi',
1406879205e1SAndreas Gohr  'いぇ'=>'ye',
1407fed467f8SDenis Scheither
1408fed467f8SDenis Scheither  // 2 character syllables, っ doubles the consonant after
1409fed467f8SDenis Scheither  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1410fed467f8SDenis Scheither  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1411fed467f8SDenis Scheither  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1412fed467f8SDenis Scheither  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1413fed467f8SDenis Scheither  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1414fed467f8SDenis Scheither  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1415fed467f8SDenis Scheither  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1416fed467f8SDenis Scheither  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1417fed467f8SDenis Scheither  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1418fed467f8SDenis Scheither  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1419799e0977SAndreas Gohr  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1420fed467f8SDenis Scheither
1421fed467f8SDenis Scheither  // 1 character syllables
1422fed467f8SDenis Scheither  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1423879205e1SAndreas Gohr  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1424fed467f8SDenis Scheither  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1425fed467f8SDenis Scheither  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
14269476a253SAndreas Gohr  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1427fed467f8SDenis Scheither  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1428fed467f8SDenis Scheither  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1429fed467f8SDenis Scheither  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1430fed467f8SDenis Scheither  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1431fed467f8SDenis Scheither  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1432fed467f8SDenis Scheither  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1433fed467f8SDenis Scheither  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1434879205e1SAndreas Gohr  'わ'=>'wa','を'=>'wo',
1435879205e1SAndreas Gohr  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1436879205e1SAndreas Gohr  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
14379476a253SAndreas Gohr  // old characters
14389476a253SAndreas Gohr  'ゑ'=>'we','ゐ'=>'wi',
1439fed467f8SDenis Scheither
14409476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
14419476a253SAndreas Gohr  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
14429476a253SAndreas Gohr  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1443fed467f8SDenis Scheither
14449476a253SAndreas Gohr  // never seen one of those (disabled for the moment)
1445879205e1SAndreas Gohr  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
14469476a253SAndreas Gohr  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
14479476a253SAndreas Gohr  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
14489476a253SAndreas Gohr  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
14499476a253SAndreas Gohr  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
14509476a253SAndreas Gohr  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
14519476a253SAndreas Gohr  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
14529476a253SAndreas Gohr  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
14539476a253SAndreas Gohr  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
14549476a253SAndreas Gohr  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
14559476a253SAndreas Gohr  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
14569476a253SAndreas Gohr  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
14579476a253SAndreas Gohr  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
14589476a253SAndreas Gohr  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1459fed467f8SDenis Scheither
1460fed467f8SDenis Scheither  // 'spare' characters from other romanization systems
1461fed467f8SDenis Scheither  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1462fed467f8SDenis Scheither  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1463fed467f8SDenis Scheither  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1464fed467f8SDenis Scheither  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1465fed467f8SDenis Scheither  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1466fed467f8SDenis Scheither  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1467fed467f8SDenis Scheither  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1468fed467f8SDenis Scheither  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1469fed467f8SDenis Scheither  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1470fed467f8SDenis Scheither  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1471fed467f8SDenis Scheither
1472fed467f8SDenis Scheither
14738a831f2bSAndreas Gohr  // Japanese katakana
1474fed467f8SDenis Scheither
1475fed467f8SDenis Scheither  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with a macron, but we don't want that in our URLs)
1476fed467f8SDenis Scheither  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1477fed467f8SDenis Scheither  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1478fed467f8SDenis Scheither  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1479fed467f8SDenis Scheither  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1480fed467f8SDenis Scheither  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1481fed467f8SDenis Scheither  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1482fed467f8SDenis Scheither  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1483fed467f8SDenis Scheither  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1484fed467f8SDenis Scheither  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1485799e0977SAndreas Gohr  'ッティー'=>'ttii',
1486799e0977SAndreas Gohr  'ッヂィー'=>'ddii',
1487fed467f8SDenis Scheither
1488fed467f8SDenis Scheither  // 3 character syllables - doubled vowels
1489fed467f8SDenis Scheither  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1490fed467f8SDenis Scheither  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1491fed467f8SDenis Scheither  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1492fed467f8SDenis Scheither  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1493fed467f8SDenis Scheither  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1494fed467f8SDenis Scheither  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1495fed467f8SDenis Scheither  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1496fed467f8SDenis Scheither  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1497fed467f8SDenis Scheither  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1498fed467f8SDenis Scheither  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1499fed467f8SDenis Scheither  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1500fed467f8SDenis Scheither  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1501fed467f8SDenis Scheither  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1502fed467f8SDenis Scheither  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1503fed467f8SDenis Scheither  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1504fed467f8SDenis Scheither  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1505fed467f8SDenis Scheither  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1506fed467f8SDenis Scheither  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1507fed467f8SDenis Scheither  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1508fed467f8SDenis Scheither  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1509fed467f8SDenis Scheither  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1510fed467f8SDenis Scheither  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1511fed467f8SDenis Scheither  'ウェー'=>'wee','ウィー'=>'wii',
1512fed467f8SDenis Scheither  'イェー'=>'yee',
1513799e0977SAndreas Gohr  'ティー'=>'tii',
1514799e0977SAndreas Gohr  'ヂィー'=>'dii',
1515fed467f8SDenis Scheither
1516fed467f8SDenis Scheither  // 3 character syllables - doubled consonants
1517fed467f8SDenis Scheither  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1518fed467f8SDenis Scheither  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1519fed467f8SDenis Scheither  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1520fed467f8SDenis Scheither  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1521fed467f8SDenis Scheither  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1522fed467f8SDenis Scheither  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1523fed467f8SDenis Scheither  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1524fed467f8SDenis Scheither  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1525fed467f8SDenis Scheither  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1526799e0977SAndreas Gohr  'ッティ'=>'tti',
1527799e0977SAndreas Gohr  'ッヂィ'=>'ddi',
1528fed467f8SDenis Scheither
1529fed467f8SDenis Scheither  // 3 character syllables - doubled vowel and consonants
1530fed467f8SDenis Scheither  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1531fed467f8SDenis Scheither  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1532fed467f8SDenis Scheither  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1533fed467f8SDenis Scheither  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1534fed467f8SDenis Scheither  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1535fed467f8SDenis Scheither  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1536fed467f8SDenis Scheither  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1537fed467f8SDenis Scheither  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1538799e0977SAndreas Gohr  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1539799e0977SAndreas Gohr  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1540fed467f8SDenis Scheither  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1541fed467f8SDenis Scheither
1542fed467f8SDenis Scheither  // 2 character syllables - normal
1543799e0977SAndreas Gohr  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1544799e0977SAndreas Gohr  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1545799e0977SAndreas Gohr  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1546fed467f8SDenis Scheither  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1547fed467f8SDenis Scheither  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1548fed467f8SDenis Scheither  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1549fed467f8SDenis Scheither  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1550fed467f8SDenis Scheither  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1551fed467f8SDenis Scheither  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1552fed467f8SDenis Scheither  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1553fed467f8SDenis Scheither  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1554879205e1SAndreas Gohr  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1555879205e1SAndreas Gohr  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1556fed467f8SDenis Scheither  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1557fed467f8SDenis Scheither  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1558fed467f8SDenis Scheither  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1559799e0977SAndreas Gohr  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1560fed467f8SDenis Scheither  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1561fed467f8SDenis Scheither  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1562fed467f8SDenis Scheither  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1563fed467f8SDenis Scheither  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1564fed467f8SDenis Scheither  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1565fed467f8SDenis Scheither  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1566fed467f8SDenis Scheither  'ウェ'=>'we','ウィ'=>'wi',
1567fed467f8SDenis Scheither  'イェ'=>'ye',
1568799e0977SAndreas Gohr  'ティ'=>'ti',
1569799e0977SAndreas Gohr  'ヂィ'=>'di',
1570fed467f8SDenis Scheither
1571fed467f8SDenis Scheither  // 2 character syllables - doubled vowel
1572fed467f8SDenis Scheither  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1573fed467f8SDenis Scheither  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1574fed467f8SDenis Scheither  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1575fed467f8SDenis Scheither  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1576fed467f8SDenis Scheither  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1577fed467f8SDenis Scheither  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1578fed467f8SDenis Scheither  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1579fed467f8SDenis Scheither  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1580fed467f8SDenis Scheither  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1581fed467f8SDenis Scheither  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1582fed467f8SDenis Scheither  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1583799e0977SAndreas Gohr  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1584fed467f8SDenis Scheither  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1585fed467f8SDenis Scheither  'ワー'=>'waa','ヲー'=>'woo',
1586fed467f8SDenis Scheither  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1587fed467f8SDenis Scheither  'ヵー'=>'kaa','ヶー'=>'kee',
15889476a253SAndreas Gohr  // old characters
15899476a253SAndreas Gohr  'ヱー'=>'wee','ヰー'=>'wii',
1590fed467f8SDenis Scheither
1591879205e1SAndreas Gohr  // separate katakana 'n'
1592879205e1SAndreas Gohr  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1593879205e1SAndreas Gohr  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1594879205e1SAndreas Gohr
1595fed467f8SDenis Scheither  // 2 character syllables - doubled consonants
1596fed467f8SDenis Scheither  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1597fed467f8SDenis Scheither  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1598fed467f8SDenis Scheither  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1599fed467f8SDenis Scheither  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1600fed467f8SDenis Scheither  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1601fed467f8SDenis Scheither  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1602fed467f8SDenis Scheither  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1603fed467f8SDenis Scheither  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1604799e0977SAndreas Gohr  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1605799e0977SAndreas Gohr  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1606fed467f8SDenis Scheither  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1607fed467f8SDenis Scheither
1608fed467f8SDenis Scheither  // 1 character syllables
1609fed467f8SDenis Scheither  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1610fed467f8SDenis Scheither  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1611fed467f8SDenis Scheither  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1612fed467f8SDenis Scheither  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1613fed467f8SDenis Scheither  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1614fed467f8SDenis Scheither  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1615fed467f8SDenis Scheither  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1616fed467f8SDenis Scheither  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1617fed467f8SDenis Scheither  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1618fed467f8SDenis Scheither  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1619879205e1SAndreas Gohr  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1620fed467f8SDenis Scheither  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1621fed467f8SDenis Scheither  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1622fed467f8SDenis Scheither  'ワ'=>'wa','ヲ'=>'wo',
1623fed467f8SDenis Scheither  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1624fed467f8SDenis Scheither  'ヵ'=>'ka','ヶ'=>'ke',
16259476a253SAndreas Gohr  // old characters
16269476a253SAndreas Gohr  'ヱ'=>'we','ヰ'=>'wi',
1627fed467f8SDenis Scheither
16289476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1629fed467f8SDenis Scheither  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1630fed467f8SDenis Scheither  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1631fed467f8SDenis Scheither
1632799e0977SAndreas Gohr  // special characters
1633799e0977SAndreas Gohr  '・'=>'_','、'=>'_',
1634799e0977SAndreas Gohr  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1635799e0977SAndreas Gohr
1636fed467f8SDenis Scheither  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1637fed467f8SDenis Scheither  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1638fed467f8SDenis Scheither  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1639fed467f8SDenis Scheither  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1640fed467f8SDenis Scheither  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1641fed467f8SDenis Scheither  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1642fed467f8SDenis Scheither  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1643fed467f8SDenis Scheither  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1644fed467f8SDenis Scheither  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1645fed467f8SDenis Scheither  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1646fed467f8SDenis Scheither  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1647fed467f8SDenis Scheither  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
16488a831f2bSAndreas Gohr
16498a831f2bSAndreas Gohr  // "Greeklish"
16508a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
16518a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
16528a831f2bSAndreas Gohr
16538a831f2bSAndreas Gohr  // Thai
16548a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
16558a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
16568a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
16578a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
16588a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
16598a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1660014d0ab6SAndreas Gohr  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1661014d0ab6SAndreas Gohr  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1662014d0ab6SAndreas Gohr  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1663014d0ab6SAndreas Gohr  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1664014d0ab6SAndreas Gohr  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1665014d0ab6SAndreas Gohr  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1666014d0ab6SAndreas Gohr  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1667014d0ab6SAndreas Gohr  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1668014d0ab6SAndreas Gohr  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1669014d0ab6SAndreas Gohr  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1670014d0ab6SAndreas Gohr  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1671014d0ab6SAndreas Gohr  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
16728a831f2bSAndreas Gohr
16738a831f2bSAndreas Gohr  // Korean
16748a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
16758a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
16768a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
16778a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
16788a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
16798a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
16808a831f2bSAndreas Gohr);
1681340756e4Sandi
16828a831f2bSAndreas Gohr
1683