xref: /dokuwiki/inc/utf8.php (revision 3cf900244031048d73d8dc9cc4ae32a9362dca59)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
51f2058faSAndreas Gohr * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide whether the mbstring extension is used by the helpers in this file.
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, 0 otherwise. A value defined before this file is loaded is
 * left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// when mbstring is in use, make the mb_* functions default to UTF-8
if(UTF8_MBSTRING){
    mb_internal_encoding('UTF-8');
}
215e613a5cSchris
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param  string $str the string to inspect
     * @return bool   false if at least one byte outside 0x00-0x7F was found, true otherwise
     */
    function utf8_isASCII($str){
        // the pattern has no 'u' modifier, so it is applied byte by byte
        $found = preg_match('/(?:[^\x00-\x7F])/', $str);
        return $found !== 1;
    }
}
3244f669e9Sandi
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is dropped, so the result is a
     * pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to strip
     * @return string the remaining 7bit bytes in their original order
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax is gone in PHP 8
            $byte = $str[$i];
            if(ord($byte) < 128){
                $ascii .= $byte;
            }
        }
        return $ascii;
    }
}
52e1906e6eSandi
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is looked at: each lead byte has to announce a
     * sequence of 2 to 6 bytes and has to be followed by that many
     * continuation bytes (10bbbbbb). The decoded values are not inspected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the bytes to check
     * @return bool   true if all sequences are well formed, false otherwise
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);
            $pos++;

            if ($lead < 0x80) continue;              // 0bbbbbbb

            // how many continuation bytes does the lead byte announce?
            if     (($lead & 0xE0) == 0xC0) $pending = 1; // 110bbbbb
            elseif (($lead & 0xF0) == 0xE0) $pending = 2; // 1110bbbb
            elseif (($lead & 0xF8) == 0xF0) $pending = 3; // 11110bbb
            elseif (($lead & 0xFC) == 0xF8) $pending = 4; // 111110bb
            elseif (($lead & 0xFE) == 0xFC) $pending = 5; // 1111110b
            else return false;                            // does not match any model

            for (; $pending > 0; $pending--) {
                if ($pos == $total) return false;                      // string ends inside a sequence
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;    // not a continuation byte
                $pos++;
            }
        }
        return true;
    }
}
8049c713a3Sandi
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() turns every character into exactly one byte (characters
     * that are not in ISO-8859-1 become '?'), so the byte length of its
     * result can be used as the character count.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        $singleByte = utf8_decode($string);
        return strlen($singleByte);
    }
}
972f954959Sandi
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * With UTF8_MBSTRING set the call is passed on to mb_substr(), otherwise
     * the part is cut out with a regular expression using the 'u' modifier.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string  $str    the UTF-8 string
     * @param integer $offset number of UTF-8 characters offset (from left)
     * @param integer $length (optional) length in UTF-8 characters from offset
     * @return mixed string; the regex fallback returns '' where substr() would fail
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * Notes on the fallback:
         *
         * - a single repetition is kept below 65536 (the limit given in the
         *   original notes for pcre), larger counts are expressed as repeated
         *   groups of 65535 characters plus a remainder
         * - the empty string is returned where the pattern can not match
         * - counting the characters is relatively expensive, so it is only
         *   done for negative offsets or when a length was given
         */

        // cast parameters once to avoid multiple notices/warnings further down
        $str    = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $charCount = null;   // number of characters in $str, filled on demand

        // turn a negative offset into the equivalent positive one
        // (a tail anchored pattern would work too, but is much slower)
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = max(0, $charCount + $offset);
        }

        // part 1: a non-capturing group skipping $offset characters
        $skip = '^';
        if ($offset > 0) {
            $chunks = (int)($offset/65535);
            $rest   = $offset%65535;
            $repeat = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';
            $skip   = '^(?:'.$repeat.'.{'.$rest.'})';
        }

        // part 2: the capturing group holding the result
        $take = '';
        if ($length === null) {
            $take = '(.*)$';                             // the rest of the string
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';         // offset behind the end

            if ($length > 0) {
                // do not ask for more characters than are left
                $length = min($charCount-$offset, $length);

                $chunks = (int)($length/65535);
                $rest   = $length%65535;
                $repeat = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';

                // capture exactly $length characters
                $take = '('.$repeat.'.{'.$rest.'})';
            } elseif ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $chunks = (int)((-$length)/65535);
                $rest   = (-$length)%65535;
                $repeat = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';

                // capture everything but the last -$length characters
                $take = '(.*)(?:'.$repeat.'.{'.$rest.'})$';
            }
        }

        if (!preg_match('#'.$skip.$take.'#us',$str,$found)) return '';
        return $found[1];
    }
}
20110f09f2aSAndreas Gohr
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is put together from the characters in front of $start
     * (only when $start is greater than 0), the replacement and everything
     * from character $start+$length on.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string  $string      the UTF-8 string to work on
     * @param  string  $replacement the text to insert
     * @param  integer $start       character offset where the replacement begins
     * @param  integer $length      number of characters to replace, defaults to 0
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $head = ($start>0) ? utf8_substr($string, 0, $start) : '';
        $tail = utf8_substr($string, $start+$length);
        return $head.$replacement.$tail;
    }
}
217dc57ef04Sandi
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is passed on to ltrim(). Otherwise each
     * character of the charlist is taken literally; ranges like 'a..z' are
     * not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str      the UTF-8 string to trim
     * @param  string $charlist characters to remove from the start of the string
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a characterclass, this includes ^ which
        // would negate the class when it is the first character
        $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
235f29317c1Sandi
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is passed on to rtrim(). Otherwise each
     * character of the charlist is taken literally; ranges like 'a..z' are
     * not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str      the UTF-8 string to trim
     * @param  string $charlist characters to remove from the end of the string
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a characterclass, this includes ^ which
        // would negate the class when it is the first character
        $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}
253f29317c1Sandi
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist the call is passed on to trim(), otherwise the
     * string runs through utf8_rtrim() and then utf8_ltrim().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      the UTF-8 string to trim
     * @param  string $charlist characters to remove from both ends
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $rightTrimmed = utf8_rtrim($str,$charlist);
            return utf8_ltrim($rightTrimmed,$charlist);
        }

        return trim($str);
    }
}
268f29317c1Sandi
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * translated with the global $UTF8_UPPER_TO_LOWER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
28682257610Sandi
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * translated with the global $UTF8_LOWER_TO_UPPER table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
30482257610Sandi
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if ($chars == 0) return '';
        if ($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $parts);
        return utf8_strtoupper($parts[1]).$parts[2];
    }
}
32626ece5a7SAndreas Gohr
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str UTF-8 encoded string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // A word is a run of characters that are not in this set
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word:
     *                        [0] whole match, [2] leading whitespace, [3] first character
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // Cut by the byte length of what the pattern captured. ltrim() is not
        // used because its default character set differs from the pattern's
        // set (it keeps form feeds and removes NUL bytes).
        $rest = (string)substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
36326ece5a7SAndreas Gohr
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS. Use the optional parameter to just deaccent lower
     * ($case = -1) or upper ($case = 1) letters. Default is to deaccent both
     * cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string  $string UTF-8 encoded string
     * @param  integer $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }
        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
38582257610Sandi
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned unchanged, everything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; //nothing to do
    }
}
3998a831f2bSAndreas Gohr
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
     * string. In addition the range 0x00 to 0x19 is always stripped.
     *
     * NOTE(review): 0x1A to 0x1F are not part of that range - confirm whether
     * this is intended.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class,
     *                            so it is inserted unquoted)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

        // the quoted special chars are built on the first call and reused afterwards
        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}
425099ada41Sandi
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched bytewise with strpos(), the byte position found
     * is then converted to a character position. As long as that position is
     * still in front of $offset the search is repeated further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack UTF-8 string to search in
     * @param  string  $needle   UTF-8 string to search for
     * @param  integer $offset   character offset to start the search at
     * @return integer character position of the needle, false if it was not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $shift = 0;   // bytes to add to $offset to get the byte position to search from

        do {
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) return false;

            // character position = number of characters in front of the match
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                // match lies in front of the wanted offset, move the start
                // by the number of extra bytes seen so far
                $shift = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
456f29317c1Sandi
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Codepoints below 0x80 are kept as plain characters, codepoints below
     * 0x100 become decimal entities and everything above becomes a
     * hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();
        foreach (utf8_to_unicode($str) as $codepoint) {
            if ($codepoint >= 0x100) {
                $pieces[] = '&#x'.dechex($codepoint).';';
            } elseif ($codepoint >= 0x80) {
                $pieces[] = '&#'.$codepoint.';';
            } else {
                $pieces[] = chr($codepoint);
            }
        }
        return implode('', $pieces);
    }
}
4789f9fb0e5STom N Harris
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (&#..; and &#x..;) are converted,
     * using utf8_decode_numeric(). When $entities is anything but null,
     * every &..; token is handed to utf8_entity_decoder::decode() instead,
     * which handles named entities like &amp; and &lt; as well. Both kinds
     * are replaced in a single pass.
     *
     * Example from the original author for the problem two separate passes
     * would have with "&amp;#38;&#38;amp;#38;":
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder is created on the first call and shared by later calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
510df957b36SAndreas Gohr
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback turning a matched numeric entity into its UTF-8 character
     *
     * @param  array $ent regex matches: [2] holds 'x' or 'X' for hexadecimal
     *                    entities, [3] holds the digits
     * @return string the character as returned by unicode_to_utf8()
     * @see    utf8_unhtml
     */
    function utf8_decode_numeric($ent) {
        $marker = $ent[2];
        $isHex  = ($marker == 'X' || $marker == 'x');

        $cp = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
525df957b36SAndreas Gohr
if(!class_exists('utf8_entity_decoder')){
    /**
     * Translates HTML entities to UTF-8 characters
     *
     * Used by utf8_unhtml() as callback object. Named entities are looked up
     * in a table built from get_html_translation_table(), numeric entities
     * are passed on to utf8_decode_numeric().
     */
    class utf8_entity_decoder {
        /** @var array maps entities ('&amp;') to their UTF-8 character */
        var $table;

        /**
         * Build the entity table
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES);
            $table = array_flip($table);
            $this->table = array_map(array($this,'makeutf8'), $table);
        }

        /**
         * PHP 4 style constructor, kept for code calling it by name.
         * Newer PHP versions no longer run it on instantiation, so the
         * work is done in __construct().
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Convert one character of the translation table to UTF-8
         *
         * @param  string $c character as found in the translation table
         * @return string
         */
        function makeutf8($c) {
            // A single byte is read as a codepoint below 0x100. Longer values
            // are assumed to be UTF-8 already (the translation table is
            // created in UTF-8 by default on current PHP versions); ord()
            // would only see their first byte.
            if (strlen($c) > 1) return $c;
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback for preg_replace_callback() in utf8_unhtml()
         *
         * @param  array $ent regex matches: [0] the whole entity, [1] '#' for
         *                    numeric entities
         * @return string the decoded character, unknown entities are returned unchanged
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
548ea2eed85Sandi
549df957b36SAndreas Gohrif(!function_exists('utf8_to_unicode')){
550ea2eed85Sandi    /**
5511abfaba4SAndreas Gohr     * Takes an UTF-8 string and returns an array of ints representing the
5521abfaba4SAndreas Gohr     * Unicode characters. Astral planes are supported ie. the ints in the
5531abfaba4SAndreas Gohr     * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
5541abfaba4SAndreas Gohr     * are not allowed.
55582257610Sandi     *
5561abfaba4SAndreas Gohr     * If $strict is set to true the function returns false if the input
5571abfaba4SAndreas Gohr     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
5581abfaba4SAndreas Gohr     * level E_USER_WARNING
5591abfaba4SAndreas Gohr     *
5601abfaba4SAndreas Gohr     * Note: this function has been modified slightly in this library to
5611abfaba4SAndreas Gohr     * trigger errors on encountering bad bytes
5621abfaba4SAndreas Gohr     *
5631abfaba4SAndreas Gohr     * @author <hsivonen@iki.fi>
5641abfaba4SAndreas Gohr     * @author Harry Fuecks <hfuecks@gmail.com>
5651abfaba4SAndreas Gohr     * @param  string  UTF-8 encoded string
5661abfaba4SAndreas Gohr     * @param  boolean Check for invalid sequences?
56744881bd0Shenning.noren     * @return mixed array of unicode code points or false if UTF-8 invalid
5681abfaba4SAndreas Gohr     * @see    unicode_to_utf8
5691abfaba4SAndreas Gohr     * @link   http://hsivonen.iki.fi/php-utf8/
5701abfaba4SAndreas Gohr     * @link   http://sourceforge.net/projects/phputf8/
57182257610Sandi     */
5721abfaba4SAndreas Gohr    function utf8_to_unicode($str,$strict=false) {
5731abfaba4SAndreas Gohr        $mState = 0;     // cached expected number of octets after the current octet
5741abfaba4SAndreas Gohr                         // until the beginning of the next UTF8 character sequence
5751abfaba4SAndreas Gohr        $mUcs4  = 0;     // cached Unicode character
5761abfaba4SAndreas Gohr        $mBytes = 1;     // cached expected number of octets in the current sequence
57782257610Sandi
5781abfaba4SAndreas Gohr        $out = array();
5791abfaba4SAndreas Gohr
5801abfaba4SAndreas Gohr        $len = strlen($str);
5811abfaba4SAndreas Gohr
5821abfaba4SAndreas Gohr        for($i = 0; $i < $len; $i++) {
5831abfaba4SAndreas Gohr
5841abfaba4SAndreas Gohr            $in = ord($str{$i});
5851abfaba4SAndreas Gohr
5861abfaba4SAndreas Gohr            if ( $mState == 0) {
5871abfaba4SAndreas Gohr
5881abfaba4SAndreas Gohr                // When mState is zero we expect either a US-ASCII character or a
5891abfaba4SAndreas Gohr                // multi-octet sequence.
5901abfaba4SAndreas Gohr                if (0 == (0x80 & ($in))) {
5911abfaba4SAndreas Gohr                    // US-ASCII, pass straight through.
5921abfaba4SAndreas Gohr                    $out[] = $in;
5931abfaba4SAndreas Gohr                    $mBytes = 1;
5941abfaba4SAndreas Gohr
5951abfaba4SAndreas Gohr                } else if (0xC0 == (0xE0 & ($in))) {
5961abfaba4SAndreas Gohr                    // First octet of 2 octet sequence
5971abfaba4SAndreas Gohr                    $mUcs4 = ($in);
5981abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
5991abfaba4SAndreas Gohr                    $mState = 1;
6001abfaba4SAndreas Gohr                    $mBytes = 2;
6011abfaba4SAndreas Gohr
6021abfaba4SAndreas Gohr                } else if (0xE0 == (0xF0 & ($in))) {
6031abfaba4SAndreas Gohr                    // First octet of 3 octet sequence
6041abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6051abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
6061abfaba4SAndreas Gohr                    $mState = 2;
6071abfaba4SAndreas Gohr                    $mBytes = 3;
6081abfaba4SAndreas Gohr
6091abfaba4SAndreas Gohr                } else if (0xF0 == (0xF8 & ($in))) {
6101abfaba4SAndreas Gohr                    // First octet of 4 octet sequence
6111abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6121abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x07) << 18;
6131abfaba4SAndreas Gohr                    $mState = 3;
6141abfaba4SAndreas Gohr                    $mBytes = 4;
6151abfaba4SAndreas Gohr
6161abfaba4SAndreas Gohr                } else if (0xF8 == (0xFC & ($in))) {
6171abfaba4SAndreas Gohr                    /* First octet of 5 octet sequence.
6181abfaba4SAndreas Gohr                     *
6191abfaba4SAndreas Gohr                     * This is illegal because the encoded codepoint must be either
6201abfaba4SAndreas Gohr                     * (a) not the shortest form or
6211abfaba4SAndreas Gohr                     * (b) outside the Unicode range of 0-0x10FFFF.
6221abfaba4SAndreas Gohr                     * Rather than trying to resynchronize, we will carry on until the end
6231abfaba4SAndreas Gohr                     * of the sequence and let the later error handling code catch it.
6241abfaba4SAndreas Gohr                     */
6251abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6261abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x03) << 24;
6271abfaba4SAndreas Gohr                    $mState = 4;
6281abfaba4SAndreas Gohr                    $mBytes = 5;
6291abfaba4SAndreas Gohr
6301abfaba4SAndreas Gohr                } else if (0xFC == (0xFE & ($in))) {
6311abfaba4SAndreas Gohr                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
6321abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6331abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 1) << 30;
6341abfaba4SAndreas Gohr                    $mState = 5;
6351abfaba4SAndreas Gohr                    $mBytes = 6;
6361abfaba4SAndreas Gohr
6371abfaba4SAndreas Gohr                } elseif($strict) {
6381abfaba4SAndreas Gohr                    /* Current octet is neither in the US-ASCII range nor a legal first
6391abfaba4SAndreas Gohr                     * octet of a multi-octet sequence.
6401abfaba4SAndreas Gohr                     */
6411abfaba4SAndreas Gohr                    trigger_error(
6421abfaba4SAndreas Gohr                            'utf8_to_unicode: Illegal sequence identifier '.
6431abfaba4SAndreas Gohr                                'in UTF-8 at byte '.$i,
6441abfaba4SAndreas Gohr                            E_USER_WARNING
6451abfaba4SAndreas Gohr                        );
64644881bd0Shenning.noren                    return false;
6471abfaba4SAndreas Gohr
6481abfaba4SAndreas Gohr                }
6491abfaba4SAndreas Gohr
6501abfaba4SAndreas Gohr            } else {
6511abfaba4SAndreas Gohr
6521abfaba4SAndreas Gohr                // When mState is non-zero, we expect a continuation of the multi-octet
6531abfaba4SAndreas Gohr                // sequence
6541abfaba4SAndreas Gohr                if (0x80 == (0xC0 & ($in))) {
6551abfaba4SAndreas Gohr
6561abfaba4SAndreas Gohr                    // Legal continuation.
6571abfaba4SAndreas Gohr                    $shift = ($mState - 1) * 6;
6581abfaba4SAndreas Gohr                    $tmp = $in;
6591abfaba4SAndreas Gohr                    $tmp = ($tmp & 0x0000003F) << $shift;
6601abfaba4SAndreas Gohr                    $mUcs4 |= $tmp;
6611abfaba4SAndreas Gohr
6621abfaba4SAndreas Gohr                    /**
6631abfaba4SAndreas Gohr                     * End of the multi-octet sequence. mUcs4 now contains the final
6641abfaba4SAndreas Gohr                     * Unicode codepoint to be output
6651abfaba4SAndreas Gohr                     */
6661abfaba4SAndreas Gohr                    if (0 == --$mState) {
6671abfaba4SAndreas Gohr
6681abfaba4SAndreas Gohr                        /*
6691abfaba4SAndreas Gohr                         * Check for illegal sequences and codepoints.
6701abfaba4SAndreas Gohr                         */
6711abfaba4SAndreas Gohr                        // From Unicode 3.1, non-shortest form is illegal
6721abfaba4SAndreas Gohr                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
6731abfaba4SAndreas Gohr                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
6741abfaba4SAndreas Gohr                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
6751abfaba4SAndreas Gohr                            (4 < $mBytes) ||
6761abfaba4SAndreas Gohr                            // From Unicode 3.2, surrogate characters are illegal
6771abfaba4SAndreas Gohr                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
6781abfaba4SAndreas Gohr                            // Codepoints outside the Unicode range are illegal
6791abfaba4SAndreas Gohr                            ($mUcs4 > 0x10FFFF)) {
6801abfaba4SAndreas Gohr
6811abfaba4SAndreas Gohr                            if($strict){
6821abfaba4SAndreas Gohr                                trigger_error(
6831abfaba4SAndreas Gohr                                        'utf8_to_unicode: Illegal sequence or codepoint '.
6841abfaba4SAndreas Gohr                                            'in UTF-8 at byte '.$i,
6851abfaba4SAndreas Gohr                                        E_USER_WARNING
6861abfaba4SAndreas Gohr                                    );
6871abfaba4SAndreas Gohr
68844881bd0Shenning.noren                                return false;
6891abfaba4SAndreas Gohr                            }
6901abfaba4SAndreas Gohr
6911abfaba4SAndreas Gohr                        }
6921abfaba4SAndreas Gohr
6931abfaba4SAndreas Gohr                        if (0xFEFF != $mUcs4) {
6941abfaba4SAndreas Gohr                            // BOM is legal but we don't want to output it
6951abfaba4SAndreas Gohr                            $out[] = $mUcs4;
6961abfaba4SAndreas Gohr                        }
6971abfaba4SAndreas Gohr
6981abfaba4SAndreas Gohr                        //initialize UTF8 cache
6991abfaba4SAndreas Gohr                        $mState = 0;
7001abfaba4SAndreas Gohr                        $mUcs4  = 0;
7011abfaba4SAndreas Gohr                        $mBytes = 1;
7021abfaba4SAndreas Gohr                    }
7031abfaba4SAndreas Gohr
7041abfaba4SAndreas Gohr                } elseif($strict) {
7051abfaba4SAndreas Gohr                    /**
7061abfaba4SAndreas Gohr                     *((0xC0 & (*in) != 0x80) && (mState != 0))
7071abfaba4SAndreas Gohr                     * Incomplete multi-octet sequence.
7081abfaba4SAndreas Gohr                     */
7091abfaba4SAndreas Gohr                    trigger_error(
7101abfaba4SAndreas Gohr                            'utf8_to_unicode: Incomplete multi-octet '.
7111abfaba4SAndreas Gohr                            '   sequence in UTF-8 at byte '.$i,
7121abfaba4SAndreas Gohr                            E_USER_WARNING
7131abfaba4SAndreas Gohr                        );
7141abfaba4SAndreas Gohr
71544881bd0Shenning.noren                    return false;
71682257610Sandi                }
71782257610Sandi            }
71882257610Sandi        }
7191abfaba4SAndreas Gohr        return $out;
72082257610Sandi    }
721df957b36SAndreas Gohr}
72282257610Sandi
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict such values are silently skipped.
     *
     * Note: the result is assembled by plain string concatenation. No output
     * buffer is opened, so an early return in strict mode cannot leave a
     * dangling buffer level behind.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ( ($cp < 0) || ($cp > 0x10ffff) ) {
                # Outside the Unicode range (negative values included)

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }
                // lenient mode: drop the value

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)

                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if($cp == 0xFEFF) {
                # Byte order mark (skip)

                // nop -- zap the BOM

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Test for illegal surrogates

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }
                // lenient mode: drop the surrogate

            } else if ($cp <= 0xffff) {
                # 3 byte sequence

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff, upper bound checked above)

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
81582257610Sandi
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * When UTF8_MBSTRING is set the work is delegated to
     * mb_convert_encoding(). Otherwise the string is decoded with
     * utf8_to_unicode() and each code point is packed as a big-endian 16 bit
     * unit; code points above 0xFFFF are written as a surrogate pair instead
     * of being truncated to their low 16 bits.
     *
     * @param  string  $str UTF-8 string (taken by reference, not modified here)
     * @param  boolean $bom prepend the UTF-16BE byte order mark (FE FF)?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // utf8_to_unicode() signals rejected input with false; nothing to encode then
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0x10FFFF){
                // not representable in UTF-16 at all: emit U+FFFD REPLACEMENT CHARACTER
                $out .= pack('n',0xFFFD);
            }elseif($cp > 0xFFFF){
                // astral plane: split the 20 bit offset into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
83315fa0b4fSAndreas Gohr
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big-endian 16 bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF)
     * is combined into the code point above 0xFFFF it encodes; every other
     * unit, including unpaired surrogates, is handed to unicode_to_utf8()
     * unchanged.
     *
     * @param  string $str UTF-16BE string (taken by reference, not modified here)
     * @return string UTF-8 encoded string
     * @see    unicode_to_utf8
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $uni  = array();
        $high = 0; // pending high surrogate waiting for its partner, 0 = none

        foreach($units as $unit){
            if($high){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    // complete pair: rebuild the astral code point
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = 0;
                    continue;
                }
                // partner missing: pass the lone high surrogate on as it was
                $uni[] = $high;
                $high  = 0;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        // input ended right after a high surrogate
        if($high) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
84515fa0b4fSAndreas Gohr
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned in a single pass using the offset argument of
     * preg_match(), so the remainder is not copied after every match.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str     string to search
     * @param string $replace string to replace bad bytes with (defaults to ''
     *                        which removes them) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte
        $pattern = '/'.$UTF8_BAD.'/S';

        $result = '';
        $len    = strlen($str);
        $pos    = 0;

        while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
            if ( !isset($matches[2])) {
                // one of the well-formed alternatives matched: keep it
                $result .= $matches[0];
            } else {
                // only the catch-all group matched: this byte is invalid
                $result .= $replace;
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
887ab77016bSAndreas Gohr
if(!function_exists('utf8_correctIdx')){
    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the byte length
     * of $str yields that length. Otherwise the index is moved while it
     * points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param string $str   utf8 character string (by reference, not modified)
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for boundary,
     *                           false = up (current character)
     *                           true = down (next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        // normalise once so string offsets and the return value are always ints
        $i = (int) $i;
        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i >= $limit) return $limit;

        // walk forwards or backwards until a non-continuation byte is reached
        $step = $next ? 1 : -1;
        while ($i > 0 && $i < $limit && ((ord($str[$i]) & 0xC0) == 0x80)) {
            $i += $step;
        }

        return $i;
    }
}
9185953e889Schris
919ab77016bSAndreas Gohr// only needed if no mb_string available
920ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
92115fa0b4fSAndreas Gohr    /**
92282257610Sandi     * UTF-8 Case lookup table
92382257610Sandi     *
     * This lookup table maps lower case letters to their corresponding
     * upper case letter in UTF-8
92682257610Sandi     *
92782257610Sandi     * @author Andreas Gohr <andi@splitbrain.org>
92882257610Sandi     */
92954662a04SAndreas Gohr    global $UTF8_LOWER_TO_UPPER;
930df957b36SAndreas Gohr    if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
93172de9068SAndreas Gohr            "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
93272de9068SAndreas Gohr            "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
93372de9068SAndreas Gohr            "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
93485b77bbdSAndreas Gohr            "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
93572de9068SAndreas Gohr            "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
93685b77bbdSAndreas Gohr            "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
93785b77bbdSAndreas Gohr            "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
93885b77bbdSAndreas Gohr            "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
93972de9068SAndreas Gohr            "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
94072de9068SAndreas Gohr            "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
94172de9068SAndreas Gohr            "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
94272de9068SAndreas Gohr            "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
94372de9068SAndreas Gohr            "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
94472de9068SAndreas Gohr            "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
94572de9068SAndreas Gohr            "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
94672de9068SAndreas Gohr            "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
94772de9068SAndreas Gohr            "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
94872de9068SAndreas Gohr            "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
94972de9068SAndreas Gohr            "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
95072de9068SAndreas Gohr            "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
95172de9068SAndreas Gohr            "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
95272de9068SAndreas Gohr            "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
95372de9068SAndreas Gohr            "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
95472de9068SAndreas Gohr            "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
95572de9068SAndreas Gohr            "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
95672de9068SAndreas Gohr            "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
95772de9068SAndreas Gohr            "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
95872de9068SAndreas Gohr            "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
95972de9068SAndreas Gohr            "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
96072de9068SAndreas Gohr            "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
96172de9068SAndreas Gohr            "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
96272de9068SAndreas Gohr            "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
96372de9068SAndreas Gohr            "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
96472de9068SAndreas Gohr            "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
96572de9068SAndreas Gohr            "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
96672de9068SAndreas Gohr            "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
96772de9068SAndreas Gohr            "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
96872de9068SAndreas Gohr            "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
96972de9068SAndreas Gohr            "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
97072de9068SAndreas Gohr            "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
97172de9068SAndreas Gohr            "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
97272de9068SAndreas Gohr            "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
97372de9068SAndreas Gohr            "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
97472de9068SAndreas Gohr            "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
97572de9068SAndreas Gohr            "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
97672de9068SAndreas Gohr            "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
97772de9068SAndreas Gohr            "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
97872de9068SAndreas Gohr            "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
97972de9068SAndreas Gohr            "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
98072de9068SAndreas Gohr            "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
98172de9068SAndreas Gohr            "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
98272de9068SAndreas Gohr            "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
98372de9068SAndreas Gohr            "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
98472de9068SAndreas Gohr            "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
98572de9068SAndreas Gohr            "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
98672de9068SAndreas Gohr            "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
98772de9068SAndreas Gohr            "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
98872de9068SAndreas Gohr            "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
98972de9068SAndreas Gohr            "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
99072de9068SAndreas Gohr            "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
99172de9068SAndreas Gohr            "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
99272de9068SAndreas Gohr            "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
99372de9068SAndreas Gohr            "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
99472de9068SAndreas Gohr            "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
99572de9068SAndreas Gohr            "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
99672de9068SAndreas Gohr            "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
99772de9068SAndreas Gohr            "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
99882257610Sandi                );
99982257610Sandi
100082257610Sandi    /**
100182257610Sandi     * UTF-8 Case lookup table
100282257610Sandi     *
     * This lookup table maps upper case letters to their corresponding
     * lower case letter in UTF-8
100582257610Sandi     *
100682257610Sandi     * @author Andreas Gohr <andi@splitbrain.org>
100782257610Sandi     */
100854662a04SAndreas Gohr    global $UTF8_UPPER_TO_LOWER;
1009df957b36SAndreas Gohr    if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
101072de9068SAndreas Gohr            "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
101172de9068SAndreas Gohr            "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
101272de9068SAndreas Gohr            "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
101385b77bbdSAndreas Gohr            "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
101472de9068SAndreas Gohr            "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
101585b77bbdSAndreas Gohr            "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
101685b77bbdSAndreas Gohr            "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
101785b77bbdSAndreas Gohr            "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
101872de9068SAndreas Gohr            "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
101972de9068SAndreas Gohr            "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
102072de9068SAndreas Gohr            "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
102172de9068SAndreas Gohr            "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
102272de9068SAndreas Gohr            "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
102372de9068SAndreas Gohr            "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
102472de9068SAndreas Gohr            "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
102572de9068SAndreas Gohr            "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
102672de9068SAndreas Gohr            "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
102772de9068SAndreas Gohr            "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
102872de9068SAndreas Gohr            "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
102972de9068SAndreas Gohr            "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
103072de9068SAndreas Gohr            "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
103172de9068SAndreas Gohr            "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
103272de9068SAndreas Gohr            "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
103372de9068SAndreas Gohr            "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
103472de9068SAndreas Gohr            "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
103572de9068SAndreas Gohr            "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
103672de9068SAndreas Gohr            "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
103772de9068SAndreas Gohr            "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
103872de9068SAndreas Gohr            "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
103972de9068SAndreas Gohr            "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
104072de9068SAndreas Gohr            "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
104172de9068SAndreas Gohr            "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
104272de9068SAndreas Gohr            "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
104372de9068SAndreas Gohr            "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
104472de9068SAndreas Gohr            "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
104572de9068SAndreas Gohr            "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
104672de9068SAndreas Gohr            "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
104772de9068SAndreas Gohr            "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
104872de9068SAndreas Gohr            "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
104972de9068SAndreas Gohr            "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
105072de9068SAndreas Gohr            "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
105172de9068SAndreas Gohr            "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
105272de9068SAndreas Gohr            "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
105372de9068SAndreas Gohr            "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
105472de9068SAndreas Gohr            "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
105572de9068SAndreas Gohr            "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
105672de9068SAndreas Gohr            "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
105772de9068SAndreas Gohr            "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
105872de9068SAndreas Gohr            "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
105972de9068SAndreas Gohr            "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
106072de9068SAndreas Gohr            "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
106172de9068SAndreas Gohr            "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
106272de9068SAndreas Gohr            "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
106372de9068SAndreas Gohr            "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
106472de9068SAndreas Gohr            "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
106572de9068SAndreas Gohr            "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
106672de9068SAndreas Gohr            "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
106772de9068SAndreas Gohr            "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
106872de9068SAndreas Gohr            "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
106972de9068SAndreas Gohr            "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
107072de9068SAndreas Gohr            "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
107172de9068SAndreas Gohr            "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
107272de9068SAndreas Gohr            "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
107372de9068SAndreas Gohr            "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
107472de9068SAndreas Gohr            "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
107572de9068SAndreas Gohr            "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
107672de9068SAndreas Gohr            "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
107772de9068SAndreas Gohr                );
107872de9068SAndreas Gohr}; // end of case lookup tables
1079ab77016bSAndreas Gohr
108082257610Sandi/**
108182257610Sandi * UTF-8 lookup table for lower case accented letters
108282257610Sandi *
108382257610Sandi * This lookup table maps lower case accented characters to replacements from the
108482257610Sandi * 7-bit ASCII range (some are two letters, e.g. 'ä' => 'ae'). Set only if still empty.
108582257610Sandi *
108682257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
108782257610Sandi * @see    utf8_deaccent()
108882257610Sandi */
108954662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS;
1090df957b36SAndreas Gohrif(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
109182257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
109282257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
109382257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
109482257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
109582257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
109682257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
109782257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
109882257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
109982257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
110082257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
110182257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
110282257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
110382257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
110482257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
110574c0c504Schris  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
110682257610Sandi);
110782257610Sandi
110882257610Sandi/**
110982257610Sandi * UTF-8 lookup table for upper case accented letters
111082257610Sandi *
111182257610Sandi * This lookup table maps upper case accented characters to replacements from the
111282257610Sandi * 7-bit ASCII range (some are two letters, e.g. 'Ä' => 'Ae'). Set only if still empty.
111382257610Sandi *
111482257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
111582257610Sandi * @see    utf8_deaccent()
111682257610Sandi */
111754662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS;
1118df957b36SAndreas Gohrif(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1119df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1120df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1121df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1122df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1123df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1124df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1125df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1126df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1127df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1128df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1129df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1130df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1131df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1132df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
113374c0c504Schris  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
113482257610Sandi);
113582257610Sandi
1136099ada41Sandi/**
1137099ada41Sandi * Array of Unicode code points of common special characters
1138099ada41Sandi *
1139099ada41Sandi * This array should contain all special characters (not a letter or digit)
1140099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum
1141099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special
1142099ada41Sandi * chars. Every entry is a Unicode code point (<= 0x10ffff), never a UTF-8 byte sequence.
1143099ada41Sandi *
1144099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1145ad81d431SAndreas Gohr * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1146099ada41Sandi *
1147099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
1148099ada41Sandi * @see    utf8_stripspecials()
1149099ada41Sandi */
115054662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS;
1151df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
1152099ada41Sandi  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1153ad81d431SAndreas Gohr  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
11545c812709Sandi          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
11555c812709Sandi  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1156099ada41Sandi  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1157099ada41Sandi  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1158099ada41Sandi  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1159099ada41Sandi  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1160099ada41Sandi  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1161099ada41Sandi  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1162099ada41Sandi  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1163099ada41Sandi  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1164fae4b5fcSAndreas Gohr  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1165099ada41Sandi  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1166099ada41Sandi  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1167099ada41Sandi  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1168099ada41Sandi  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1169099ada41Sandi  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1170099ada41Sandi  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1171099ada41Sandi  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1172099ada41Sandi  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1173099ada41Sandi  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1174099ada41Sandi  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1175099ada41Sandi  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1176099ada41Sandi  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1177099ada41Sandi  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1178099ada41Sandi  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1179099ada41Sandi  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1180099ada41Sandi  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1181099ada41Sandi  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1182099ada41Sandi  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1183099ada41Sandi  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1184099ada41Sandi  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1185099ada41Sandi  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1186099ada41Sandi  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1187099ada41Sandi  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1188099ada41Sandi  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1189099ada41Sandi  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1190099ada41Sandi  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1191099ada41Sandi  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1192099ada41Sandi  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1193099ada41Sandi  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1194099ada41Sandi  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1195099ada41Sandi  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1196099ada41Sandi  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1197d5b23302STom N Harris  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1198d5b23302STom N Harris  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1199d5b23302STom N Harris  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1200d5b23302STom N Harris  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1201099ada41Sandi  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1202099ada41Sandi  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1203099ada41Sandi  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1204099ada41Sandi  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1205d5b23302STom N Harris          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1206d5b23302STom N Harris          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1207d5b23302STom N Harris  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1208d5b23302STom N Harris  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1209d5b23302STom N Harris  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1210d5b23302STom N Harris  0xffeb, 0xffec, 0xffed, 0xffee,
1211fae4b5fcSAndreas Gohr  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
1212fae4b5fcSAndreas Gohr  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
1213fae4b5fcSAndreas Gohr  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
12147de9cff5SAndreas Gohr  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
12157de9cff5SAndreas Gohr  0x2007, 0x202f, 0x2060, 0xfeff, // figure space, narrow no-break space, word joiner, BOM (U+00A0 is listed above)
1216099ada41Sandi);
1217340756e4Sandi
1218720307d9Schris// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS code point data above
1219720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1220df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
122137242afaSTom N Harris    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1222720307d9Schris    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
122385b77bbdSAndreas Gohr    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1224720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
122585b77bbdSAndreas Gohr    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1226720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
122785b77bbdSAndreas Gohr    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1228720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1229720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1230720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1231720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1232720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1233720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1234720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1235d5b23302STom N Harris    '➷➸➹➺➻➼➽➾'.
1236d5b23302STom N Harris    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1237d5b23302STom N Harris    '�'.
1238d5b23302STom N Harris    '�ﹼﹽ'.
1239d5b23302STom N Harris    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1240fae4b5fcSAndreas Gohr    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
12417de9cff5SAndreas Gohr    '����������������������������������������������������������������'.
12427de9cff5SAndreas Gohr    '   ⁠';
1243720307d9Schris
12448a831f2bSAndreas Gohr/**
12458a831f2bSAndreas Gohr * Romanization lookup table
12468a831f2bSAndreas Gohr *
12478a831f2bSAndreas Gohr * This lookup table provides a way to transform strings written in a script
12488a831f2bSAndreas Gohr * that is not based on Latin letters into plain ASCII.
12498a831f2bSAndreas Gohr *
12508a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
12518a831f2bSAndreas Gohr * one-way, from non-Latin to ASCII, and it works by simple character replacement
12528a831f2bSAndreas Gohr * only. Specialities of each language are not supported.
12538a831f2bSAndreas Gohr *
12548a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
12558a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
12568a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
12578a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
12588a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
12598a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
12608a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
12618a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
1262014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com>
1263fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de>
12648a831f2bSAndreas Gohr */
126554662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
1266df957b36SAndreas Gohrif(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1267176ae32bSAndreas Gohr  // scandinavian - differs from what we do in deaccent
1268176ae32bSAndreas Gohr  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1269176ae32bSAndreas Gohr
12708a831f2bSAndreas Gohr  //russian cyrillic
12718a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
12728a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
12738a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
12748a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
12758a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
12768a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1277d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1278f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
12798a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
12808a831f2bSAndreas Gohr  // Ukrainian cyrillic
12818a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
12828a831f2bSAndreas Gohr  // Georgian
12838a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
12848a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
12858a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
12868a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
12878a831f2bSAndreas Gohr  'ჰ'=>'xh',
12888a831f2bSAndreas Gohr  //Sanskrit
12898a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
12908a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
12918a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
12928a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
12938a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
12948a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
12958a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
12968a831f2bSAndreas Gohr  //Hebrew
12973dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
12983dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
12993dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
13008a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
13018a831f2bSAndreas Gohr  //Arabic
13028a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
13038a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
13048a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
13058a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
13068a831f2bSAndreas Gohr
1307799e0977SAndreas Gohr  // Japanese characters  (last update: 2008-05-09)
13089476a253SAndreas Gohr
13098a831f2bSAndreas Gohr  // Japanese hiragana
1310fed467f8SDenis Scheither
1311fed467f8SDenis Scheither  // 3 character syllables, っ doubles the consonant after
1312fed467f8SDenis Scheither  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1313879205e1SAndreas Gohr  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1314799e0977SAndreas Gohr  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1315879205e1SAndreas Gohr  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1316879205e1SAndreas Gohr  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1317879205e1SAndreas Gohr  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1318879205e1SAndreas Gohr  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1319879205e1SAndreas Gohr  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1320879205e1SAndreas Gohr  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1321879205e1SAndreas Gohr  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1322879205e1SAndreas Gohr  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1323879205e1SAndreas Gohr
1324879205e1SAndreas Gohr  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1325879205e1SAndreas Gohr  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1326879205e1SAndreas Gohr  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1327fed467f8SDenis Scheither
1328fed467f8SDenis Scheither   // 2 character syllables - normal
1329879205e1SAndreas Gohr  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1330fed467f8SDenis Scheither  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1331fed467f8SDenis Scheither  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1332799e0977SAndreas Gohr  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1333799e0977SAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1334fed467f8SDenis Scheither  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1335fed467f8SDenis Scheither  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1336fed467f8SDenis Scheither  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1337fed467f8SDenis Scheither  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1338fed467f8SDenis Scheither  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1339fed467f8SDenis Scheither  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1340879205e1SAndreas Gohr  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1341879205e1SAndreas Gohr  'うぇ'=>'we','うぃ'=>'wi',
1342879205e1SAndreas Gohr  'いぇ'=>'ye',
1343fed467f8SDenis Scheither
1344fed467f8SDenis Scheither  // 2 character syllables, っ doubles the consonant after
1345fed467f8SDenis Scheither  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1346fed467f8SDenis Scheither  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1347fed467f8SDenis Scheither  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1348fed467f8SDenis Scheither  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1349fed467f8SDenis Scheither  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1350fed467f8SDenis Scheither  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1351fed467f8SDenis Scheither  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1352fed467f8SDenis Scheither  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1353fed467f8SDenis Scheither  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1354fed467f8SDenis Scheither  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1355799e0977SAndreas Gohr  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1356fed467f8SDenis Scheither
1357fed467f8SDenis Scheither  // 1 character syllables
1358fed467f8SDenis Scheither  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1359879205e1SAndreas Gohr  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1360fed467f8SDenis Scheither  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1361fed467f8SDenis Scheither  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
13629476a253SAndreas Gohr  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1363fed467f8SDenis Scheither  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1364fed467f8SDenis Scheither  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1365fed467f8SDenis Scheither  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1366fed467f8SDenis Scheither  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1367fed467f8SDenis Scheither  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1368fed467f8SDenis Scheither  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1369fed467f8SDenis Scheither  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1370879205e1SAndreas Gohr  'わ'=>'wa','を'=>'wo',
1371879205e1SAndreas Gohr  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1372879205e1SAndreas Gohr  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
13739476a253SAndreas Gohr  // old characters
13749476a253SAndreas Gohr  'ゑ'=>'we','ゐ'=>'wi',
1375fed467f8SDenis Scheither
13769476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
13779476a253SAndreas Gohr  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
13789476a253SAndreas Gohr  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1379fed467f8SDenis Scheither
13809476a253SAndreas Gohr  // never seen one of those (disabled for the moment)
1381879205e1SAndreas Gohr  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
13829476a253SAndreas Gohr  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
13839476a253SAndreas Gohr  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
13849476a253SAndreas Gohr  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
13859476a253SAndreas Gohr  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
13869476a253SAndreas Gohr  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
13879476a253SAndreas Gohr  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
13889476a253SAndreas Gohr  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
13899476a253SAndreas Gohr  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
13909476a253SAndreas Gohr  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
13919476a253SAndreas Gohr  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
13929476a253SAndreas Gohr  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
13939476a253SAndreas Gohr  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
13949476a253SAndreas Gohr  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1395fed467f8SDenis Scheither
1396fed467f8SDenis Scheither  // 'spare' characters from other romanization systems
1397fed467f8SDenis Scheither  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1398fed467f8SDenis Scheither  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1399fed467f8SDenis Scheither  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1400fed467f8SDenis Scheither  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1401fed467f8SDenis Scheither  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1402fed467f8SDenis Scheither  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1403fed467f8SDenis Scheither  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1404fed467f8SDenis Scheither  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1405fed467f8SDenis Scheither  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1406fed467f8SDenis Scheither  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1407fed467f8SDenis Scheither
1408fed467f8SDenis Scheither
14098a831f2bSAndreas Gohr  // Japanese katakana
1410fed467f8SDenis Scheither
1411fed467f8SDenis Scheither  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with a macron, but we don't want that in our URLs)
1412fed467f8SDenis Scheither  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1413fed467f8SDenis Scheither  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1414fed467f8SDenis Scheither  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1415fed467f8SDenis Scheither  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1416fed467f8SDenis Scheither  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1417fed467f8SDenis Scheither  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1418fed467f8SDenis Scheither  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1419fed467f8SDenis Scheither  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1420fed467f8SDenis Scheither  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1421799e0977SAndreas Gohr  'ッティー'=>'ttii',
1422799e0977SAndreas Gohr  'ッヂィー'=>'ddii',
1423fed467f8SDenis Scheither
1424fed467f8SDenis Scheither  // 3 character syllables - doubled vowels
1425fed467f8SDenis Scheither  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1426fed467f8SDenis Scheither  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1427fed467f8SDenis Scheither  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1428fed467f8SDenis Scheither  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1429fed467f8SDenis Scheither  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1430fed467f8SDenis Scheither  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1431fed467f8SDenis Scheither  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1432fed467f8SDenis Scheither  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1433fed467f8SDenis Scheither  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1434fed467f8SDenis Scheither  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1435fed467f8SDenis Scheither  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1436fed467f8SDenis Scheither  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1437fed467f8SDenis Scheither  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1438fed467f8SDenis Scheither  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1439fed467f8SDenis Scheither  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1440fed467f8SDenis Scheither  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1441fed467f8SDenis Scheither  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1442fed467f8SDenis Scheither  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1443fed467f8SDenis Scheither  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1444fed467f8SDenis Scheither  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1445fed467f8SDenis Scheither  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1446fed467f8SDenis Scheither  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1447fed467f8SDenis Scheither  'ウェー'=>'wee','ウィー'=>'wii',
1448fed467f8SDenis Scheither  'イェー'=>'yee',
1449799e0977SAndreas Gohr  'ティー'=>'tii',
1450799e0977SAndreas Gohr  'ヂィー'=>'dii',
1451fed467f8SDenis Scheither
1452fed467f8SDenis Scheither  // 3 character syllables - doubled consonants
1453fed467f8SDenis Scheither  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1454fed467f8SDenis Scheither  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1455fed467f8SDenis Scheither  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1456fed467f8SDenis Scheither  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1457fed467f8SDenis Scheither  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1458fed467f8SDenis Scheither  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1459fed467f8SDenis Scheither  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1460fed467f8SDenis Scheither  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1461fed467f8SDenis Scheither  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1462799e0977SAndreas Gohr  'ッティ'=>'tti',
1463799e0977SAndreas Gohr  'ッヂィ'=>'ddi',
1464fed467f8SDenis Scheither
1465fed467f8SDenis Scheither  // 3 character syllables - doubled vowel and consonants
1466fed467f8SDenis Scheither  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1467fed467f8SDenis Scheither  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1468fed467f8SDenis Scheither  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1469fed467f8SDenis Scheither  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1470fed467f8SDenis Scheither  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1471fed467f8SDenis Scheither  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1472fed467f8SDenis Scheither  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1473fed467f8SDenis Scheither  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1474799e0977SAndreas Gohr  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1475799e0977SAndreas Gohr  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1476fed467f8SDenis Scheither  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1477fed467f8SDenis Scheither
1478fed467f8SDenis Scheither  // 2 character syllables - normal
1479799e0977SAndreas Gohr  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1480799e0977SAndreas Gohr  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1481799e0977SAndreas Gohr  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1482fed467f8SDenis Scheither  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1483fed467f8SDenis Scheither  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1484fed467f8SDenis Scheither  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1485fed467f8SDenis Scheither  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1486fed467f8SDenis Scheither  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1487fed467f8SDenis Scheither  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1488fed467f8SDenis Scheither  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1489fed467f8SDenis Scheither  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1490879205e1SAndreas Gohr  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1491879205e1SAndreas Gohr  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1492fed467f8SDenis Scheither  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1493fed467f8SDenis Scheither  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1494fed467f8SDenis Scheither  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1495799e0977SAndreas Gohr  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1496fed467f8SDenis Scheither  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1497fed467f8SDenis Scheither  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1498fed467f8SDenis Scheither  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1499fed467f8SDenis Scheither  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1500fed467f8SDenis Scheither  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1501fed467f8SDenis Scheither  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1502fed467f8SDenis Scheither  'ウェ'=>'we','ウィ'=>'wi',
1503fed467f8SDenis Scheither  'イェ'=>'ye',
1504799e0977SAndreas Gohr  'ティ'=>'ti',
1505799e0977SAndreas Gohr  'ヂィ'=>'di',
1506fed467f8SDenis Scheither
1507fed467f8SDenis Scheither  // 2 character syllables - doubled vowel
1508fed467f8SDenis Scheither  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1509fed467f8SDenis Scheither  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1510fed467f8SDenis Scheither  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1511fed467f8SDenis Scheither  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1512fed467f8SDenis Scheither  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1513fed467f8SDenis Scheither  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1514fed467f8SDenis Scheither  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1515fed467f8SDenis Scheither  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1516fed467f8SDenis Scheither  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1517fed467f8SDenis Scheither  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1518fed467f8SDenis Scheither  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1519799e0977SAndreas Gohr  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1520fed467f8SDenis Scheither  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1521fed467f8SDenis Scheither  'ワー'=>'waa','ヲー'=>'woo',
1522fed467f8SDenis Scheither  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1523fed467f8SDenis Scheither  'ヵー'=>'kaa','ヶー'=>'kee',
15249476a253SAndreas Gohr  // old characters
15259476a253SAndreas Gohr  'ヱー'=>'wee','ヰー'=>'wii',
1526fed467f8SDenis Scheither
1527879205e1SAndreas Gohr  // separate katakana 'n'
1528879205e1SAndreas Gohr  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1529879205e1SAndreas Gohr  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1530879205e1SAndreas Gohr
1531fed467f8SDenis Scheither  // 2 character syllables - doubled consonants
1532fed467f8SDenis Scheither  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1533fed467f8SDenis Scheither  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1534fed467f8SDenis Scheither  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1535fed467f8SDenis Scheither  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1536fed467f8SDenis Scheither  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1537fed467f8SDenis Scheither  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1538fed467f8SDenis Scheither  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1539fed467f8SDenis Scheither  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1540799e0977SAndreas Gohr  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1541799e0977SAndreas Gohr  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1542fed467f8SDenis Scheither  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1543fed467f8SDenis Scheither
1544fed467f8SDenis Scheither  // 1 character syllables
1545fed467f8SDenis Scheither  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1546fed467f8SDenis Scheither  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1547fed467f8SDenis Scheither  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1548fed467f8SDenis Scheither  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1549fed467f8SDenis Scheither  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1550fed467f8SDenis Scheither  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1551fed467f8SDenis Scheither  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1552fed467f8SDenis Scheither  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1553fed467f8SDenis Scheither  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1554fed467f8SDenis Scheither  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1555879205e1SAndreas Gohr  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1556fed467f8SDenis Scheither  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1557fed467f8SDenis Scheither  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1558fed467f8SDenis Scheither  'ワ'=>'wa','ヲ'=>'wo',
1559fed467f8SDenis Scheither  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1560fed467f8SDenis Scheither  'ヵ'=>'ka','ヶ'=>'ke',
15619476a253SAndreas Gohr  // old characters
15629476a253SAndreas Gohr  'ヱ'=>'we','ヰ'=>'wi',
1563fed467f8SDenis Scheither
15649476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1565fed467f8SDenis Scheither  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1566fed467f8SDenis Scheither  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1567fed467f8SDenis Scheither
1568799e0977SAndreas Gohr  // special characters
1569799e0977SAndreas Gohr  '・'=>'_','、'=>'_',
1570799e0977SAndreas Gohr  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1571799e0977SAndreas Gohr
1572fed467f8SDenis Scheither  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1573fed467f8SDenis Scheither  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1574fed467f8SDenis Scheither  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1575fed467f8SDenis Scheither  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1576fed467f8SDenis Scheither  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1577fed467f8SDenis Scheither  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1578fed467f8SDenis Scheither  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1579fed467f8SDenis Scheither  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1580fed467f8SDenis Scheither  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1581fed467f8SDenis Scheither  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1582fed467f8SDenis Scheither  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1583fed467f8SDenis Scheither  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
15848a831f2bSAndreas Gohr
15858a831f2bSAndreas Gohr  // "Greeklish"
15868a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
15878a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
15888a831f2bSAndreas Gohr
15898a831f2bSAndreas Gohr  // Thai
15908a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
15918a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
15928a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
15938a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
15948a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
15958a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1596014d0ab6SAndreas Gohr  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1597014d0ab6SAndreas Gohr  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1598014d0ab6SAndreas Gohr  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1599014d0ab6SAndreas Gohr  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1600014d0ab6SAndreas Gohr  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1601014d0ab6SAndreas Gohr  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1602014d0ab6SAndreas Gohr  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1603014d0ab6SAndreas Gohr  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1604014d0ab6SAndreas Gohr  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1605014d0ab6SAndreas Gohr  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1606014d0ab6SAndreas Gohr  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1607014d0ab6SAndreas Gohr  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
16088a831f2bSAndreas Gohr
16098a831f2bSAndreas Gohr  // Korean
16108a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
16118a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
16128a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
16138a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
16148a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
16158a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
16168a831f2bSAndreas Gohr);
1617340756e4Sandi
16188a831f2bSAndreas Gohr
1619