xref: /dokuwiki/inc/utf8.php (revision 7e6f32c47b0f4cdb2f5fb9ccad614427df6c4c03)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide once whether the mbstring extension is going to be used
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() is available and the constant
 * UTF8_NOMBSTRING has not been defined, otherwise 0. A UTF8_MBSTRING that
 * was defined before this file is loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// all mb_* calls should work on UTF-8 unless told otherwise
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
215e613a5cSchris
if(!function_exists('utf8_encodeFN')){
    /**
     * URL-Encode a filename to allow unicode characters
     *
     * Slashes are not encoded
     *
     * When the second parameter is true the string will be encoded only
     * if it contains anything besides a-z, A-Z, 0-9, '/', '_', '-', '.'
     * and '%' - this makes it safe to run it multiple times on the same
     * string (default is true)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urlencode
     * @param  string  $file the filename to encode
     * @param  boolean $safe return names made of safe characters unchanged
     * @return string  the encoded filename
     */
    function utf8_encodeFN($file,$safe=true){
      // The D modifier makes '$' match at the very end only. Without it a
      // name with a trailing newline would pass as "safe" and be returned
      // with the raw newline still in it.
      if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
        return $file;
      }

      // encode everything, then restore the slashes
      return str_replace('%2F','/',urlencode($file));
    }
}
4549c713a3Sandi
if(!function_exists('utf8_decodeFN')){
    /**
     * URL-Decode a filename
     *
     * Thin wrapper that hands the name to urldecode() and returns the result
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urldecode
     * @param  string $file the URL-encoded filename
     * @return string the decoded filename
     */
    function utf8_decodeFN($file){
        return urldecode($file);
    }
}
6049c713a3Sandi
if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * The string is considered ASCII unless a byte outside of the range
     * 0x00-0x7F is found in it.
     *
     * @author Andreas Haerter <netzmeister@andreas-haerter.de>
     * @param  string $str the string to test
     * @return boolean true when no high byte was found
     */
    function utf8_isASCII($str){
        $found = preg_match('/(?:[^\x00-\x7F])/', $str);
        if($found === 1){
            return false;
        }
        return true;
    }
}
7144f669e9Sandi
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or
     * above is removed, all other bytes are kept in their original order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str the string to strip
     * @return string the input without any bytes >= 0x80
     */
    function utf8_strip($str){
      // Byte-wise match (no /u modifier on purpose). This replaces the
      // former loop over $str{$i}; the curly brace offset syntax is no
      // longer accepted by current PHP versions.
      return preg_replace('/[\x80-\xFF]/','',(string) $str);
    }
}
90e1906e6eSandi
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Walks over the bytes and checks that every lead byte is followed by
     * the number of continuation bytes (10bbbbbb) its bit pattern asks
     * for. Lead bytes announcing up to five continuation bytes are accepted.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str the string to check
     * @return boolean false on the first byte that does not fit, true otherwise
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $i     = 0;

        while ($i < $total) {
            $lead = ord($Str[$i]);

            if ($lead < 0x80) {               # 0bbbbbbb
                $i++;
                continue;
            }

            if     (($lead & 0xE0) == 0xC0) $n = 1; # 110bbbbb
            elseif (($lead & 0xF0) == 0xE0) $n = 2; # 1110bbbb
            elseif (($lead & 0xF8) == 0xF0) $n = 3; # 11110bbb
            elseif (($lead & 0xFC) == 0xF8) $n = 4; # 111110bb
            elseif (($lead & 0xFE) == 0xFC) $n = 5; # 1111110b
            else return false;                      # Does not match any model

            # the next n bytes all have to look like 10bbbbbb
            for ($k = 0; $k < $n; $k++) {
                $i++;
                if ($i == $total) return false;     # string ended too early
                if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
            }

            $i++;
        }
        return true;
    }
}
11749c713a3Sandi
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright - It's
     * even faster than mb_strlen.
     *
     * utf8_decode() is not available in every PHP setup, so when it is
     * missing the length is taken from mb_strlen() (if UTF8_MBSTRING is
     * set) or, as a last resort, by counting all bytes that are not UTF-8
     * continuation bytes.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string the UTF-8 string to measure
     * @return integer number of characters
     */
    function utf8_strlen($string){
        if(function_exists('utf8_decode')){
            return strlen(utf8_decode($string));
        }

        if(UTF8_MBSTRING){
            return mb_strlen($string,'UTF-8');
        }

        // every character has exactly one byte that is not of the form 10bbbbbb
        return strlen(preg_replace('/[\x80-\xBF]/','',(string) $string));
    }
}
1342f954959Sandi
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * With UTF8_MBSTRING set the work is handed to mb_substr(). Otherwise a
     * regular expression with the 'u' flag is built: one non-captured group
     * that skips $offset characters and one captured group for the part
     * that is returned.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string
     * @param integer number of UTF-8 characters offset (from left)
     * @param integer (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        /*
         * Notes on the regex fallback:
         *
         * - pcre only supports repetitions of less than 65536. To accept
         *   larger offsets and lengths a group of 65535 characters is
         *   repeated as often as needed, followed by the remainder.
         * - substr() may return false (e.g. offset > string length) while
         *   mb_substr() returns an empty string. This code returns an
         *   empty string in those cases as well.
         * - counting the characters of the string is relatively expensive,
         *   so it is only done when needed. It is not needed for a positive
         *   offset without a length.
         */

        // cast parameters once to avoid multiple notices/warnings later on
        $str       = (string)$str;    // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset    = (int)$offset;
        $hasLength = !is_null($length);
        if ($hasLength) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $chars = null;                // number of characters in $str, filled on demand

        // turn a negative offset into a positive one (a tail anchored
        // pattern would work as well but is horribly slow)
        if ($offset < 0) {
          $chars  = strlen(utf8_decode($str));
          $offset = $chars + $offset;
          if ($offset < 0) $offset = 0;
        }

        // the part that is skipped: a non-captured group of $offset characters
        if ($offset > 0) {
          $blocks = (int)($offset/65535);
          $rest   = $offset%65535;

          $skip = '';
          if ($blocks) $skip = '(?:.{65535}){'.$blocks.'}';
          $head = '^(?:'.$skip.'.{'.$rest.'})';
        } else {
          $head = '^';                // nothing to skip, just anchor the pattern
        }

        // the part that is returned
        $tail = '';
        if (!$hasLength) {
          $tail = '(.*)$';            // everything up to the end of the string
        } else {

          if ($chars === null) $chars = strlen(utf8_decode($str));
          if ($offset > $chars) return '';          // offset is behind the end

          if ($length > 0) {

            // never ask for more characters than are left in the string
            $length = min($chars-$offset, $length);

            $blocks = (int)($length/65535);
            $rest   = $length%65535;

            // positive length: a captured group of $length characters
            $many = '';
            if ($blocks) $many = '(?:.{65535}){'.$blocks.'}';
            $tail = '('.$many.'.{'.$rest.'})';

          } else if ($length < 0) {

            if ($length < ($offset - $chars)) return '';

            $blocks = (int)((-$length)/65535);
            $rest   = (-$length)%65535;

            // negative length: capture everything except a group of
            // -$length characters anchored at the end of the string
            $many = '';
            if ($blocks) $many = '(?:.{65535}){'.$blocks.'}';
            $tail = '(.*)(?:'.$many.'.{'.$rest.'})$';
          }
        }

        if (!preg_match('#'.$head.$tail.'#us',$str,$match)) return '';
        return $match[1];
    }
}
23810f09f2aSAndreas Gohr
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Positions are counted in characters. Unlike substr_replace() the
     * default $length is 0, which inserts $replacement at $start without
     * removing anything.
     *
     * A negative $start counts from the end of the string and a negative
     * $length leaves that many characters at the end untouched, like
     * substr_replace() does.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string  $string      the UTF-8 string to work on
     * @param  string  $replacement the string to put in
     * @param  integer $start       character position to start replacing at
     * @param  integer $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
      $start  = (int) $start;
      $length = (int) $length;

      // translate negative values into absolute ones; the character count
      // is only needed (and only calculated) in this case
      if($start < 0 || $length < 0){
        $total = utf8_strlen($string);
        if($start < 0)  $start  = max(0, $total + $start);
        if($length < 0) $length = max(0, $total - $start + $length);
      }

      $ret = '';
      if($start>0) $ret .= utf8_substr($string, 0, $start);
      $ret .= $replacement;
      $ret .= utf8_substr($string, $start+$length);
      return $ret;
    }
}
254dc57ef04Sandi
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is passed on to ltrim(). Otherwise all
     * leading characters contained in the charlist are removed. The
     * charlist is a plain list of characters, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str      the UTF-8 string to trim
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
      if($charlist == '') return ltrim($str);

      //quote charlist for use in a characterclass ('^' has to be quoted
      //as well, a leading one would negate the whole class)
      $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

      return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
272f29317c1Sandi
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is passed on to rtrim(). Otherwise all
     * trailing characters contained in the charlist are removed. The
     * charlist is a plain list of characters, ranges are not supported.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str      the UTF-8 string to trim
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
      if($charlist == '') return rtrim($str);

      //quote charlist for use in a characterclass ('^' has to be quoted
      //as well, a leading one would negate the whole class)
      $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

      //D modifier: '$' must only match at the very end of the string, not
      //in front of a final newline that is not part of the charlist
      return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
290f29317c1Sandi
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Uses trim() when no charlist is given, otherwise the string is run
     * through utf8_rtrim() first and utf8_ltrim() afterwards.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      the UTF-8 string to trim
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
      if($charlist != ''){
        $str = utf8_rtrim($str,$charlist);
        return utf8_ltrim($str,$charlist);
      }

      return trim($str);
    }
}
305f29317c1Sandi
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * translated with the global $UTF8_UPPER_TO_LOWER table
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string the UTF-8 string to convert
     * @return string
     */
    function utf8_strtolower($string){
      if(!UTF8_MBSTRING){
        global $UTF8_UPPER_TO_LOWER;
        return strtr($string,$UTF8_UPPER_TO_LOWER);
      }

      return mb_strtolower($string,'utf-8');
    }
}
32382257610Sandi
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the characters are
     * translated with the global $UTF8_LOWER_TO_UPPER table
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string the UTF-8 string to convert
     * @return string
     */
    function utf8_strtoupper($string){
      if(!UTF8_MBSTRING){
        global $UTF8_LOWER_TO_UPPER;
        return strtr($string,$UTF8_LOWER_TO_UPPER);
      }

      return mb_strtoupper($string,'utf-8');
    }
}
34182257610Sandi
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * A string that can not be split into characters (the regular
     * expression fails on invalid UTF-8) is returned unchanged.
     *
     * @author Harry Fuecks
     * @param string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
      switch ( utf8_strlen($str) ) {
        case 0:
            return '';
        case 1:
            return utf8_strtoupper($str);
        default:
            // split into the first character and the rest; without this
            // check a failed match would leave $matches empty and the
            // whole string would get lost
            if(!preg_match('/^(.{1})(.*)$/us', $str, $matches)){
                return $str;
            }
            return utf8_strtoupper($matches[1]).$matches[2];
      }
    }
}
36326ece5a7SAndreas Gohr
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param string
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
      // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
      // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
      // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
      $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

      return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The match consists of the leading whitespace (group 2, empty at the
     * start of the string), the first character of the word (group 3) and
     * the remaining characters of the word.
     *
     * @author Harry Fuecks
     * @param array of matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
      $leadingws = $matches[2];
      $first     = $matches[3];

      // Cut the rest of the word by the byte lengths of the groups that
      // were matched. ltrim() must not be used for this: its idea of
      // whitespace differs from the pattern's (it keeps \x0c but strips
      // \0), which made the wrong character get replaced.
      $rest = (string) substr($matches[0], strlen($leadingws) + strlen($first));

      return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
40026ece5a7SAndreas Gohr
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
     * and $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string  $string the UTF-8 string to work on
     * @param  integer $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }

        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
42282257610Sandi
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII strings are returned as they are, everything else is
     * translated with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string the UTF-8 string to romanize
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; //nothing to do
    }
}
4368a831f2bSAndreas Gohr
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the list of
     * stripped chars (they are not included in the special chars table)
     *
     * The quoted form of the global $UTF8_SPECIAL_CHARS2 is built on the
     * first call and reused on all later calls.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS;
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if($specials === null){
            // the string version of the table is used directly; the array
            // version would have to go through unicode_to_utf8() first
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $class = '['.$additional.'\x00-\x19'.$specials.']';
        return preg_replace('/'.$class.'/u',$repl,$string);
    }
}
462099ada41Sandi
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched with the byte based strpos(). The byte
     * position of a hit is converted into a character position by counting
     * the characters in front of it. As long as the hit lies before the
     * requested character offset the search is repeated further right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string
     * @param  string
     * @param  integer
     * @return integer
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $comp   = 0;      // bytes to add to $offset to get a byte position
        $length = null;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);
            if ($pos === false) {
                return false;
            }

            // character position of the hit
            $length = utf8_strlen(substr($haystack, 0, $pos));

            $tooEarly = ($length < $offset);
            if ($tooEarly) {
                $comp = $pos - $length;
            }
        } while ($tooEarly);

        return $length;
    }
}
493f29317c1Sandi
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied as they are, code points below
     * 0x100 become decimal entities and everything else becomes a
     * hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str the UTF-8 string to encode
     * @return string
     */
    function utf8_tohtml ($str) {
        $ret = '';
        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $ret .= '&#x'.dechex($cp).';';
                continue;
            }
            $ret .= ($cp < 0x80) ? chr($cp) : "&#$cp;";
        }
        return $ret;
    }
}
5159f9fb0e5STom N Harris
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if (!is_null($entities)) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
547df957b36SAndreas Gohr
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback turning a single numeric entity into a UTF-8 character
     *
     * Expects the match layout used by utf8_unhtml(): index 0 holds the
     * complete entity, index 2 an 'x' or 'X' for hexadecimal entities (or
     * an empty string) and index 3 the digits.
     *
     * Entities whose value is not a valid decimal or hexadecimal number
     * (e.g. "&#zz;") are returned unchanged instead of being decoded as
     * code point 0.
     *
     * @param  array $ent match array handed in by preg_replace_callback()
     * @return string the decoded character or the untouched entity
     */
    function utf8_decode_numeric($ent) {
        switch ($ent[2]) {
          case 'X':
          case 'x':
              if (!preg_match('/^[0-9A-Fa-f]+$/D', $ent[3])) return $ent[0];
              $cp = hexdec($ent[3]);
              break;
          default:
              if (!preg_match('/^[0-9]+$/D', $ent[3])) return $ent[0];
              $cp = intval($ent[3]);
              break;
        }
        return unicode_to_utf8(array($cp));
    }
}
562df957b36SAndreas Gohr
if(!class_exists('utf8_entity_decoder')){
    /**
     * Decoder for named and numeric HTML entities
     *
     * Holds a lookup table from entity (e.g. "&amp;") to the UTF-8 encoded
     * character, built from get_html_translation_table(HTML_ENTITIES).
     * Used as callback object by utf8_unhtml().
     */
    class utf8_entity_decoder {
        /** @var array entity => UTF-8 character */
        var $table;

        /**
         * Build the lookup table
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES);
            $table = array_flip($table);
            $this->table = array_map(array($this,'makeutf8'), $table);
        }

        /**
         * Old style constructor
         *
         * Only kept for code that calls it by name. Current PHP versions
         * no longer treat a method named like the class as constructor,
         * so the table is built in __construct().
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Make sure a character of the translation table is UTF-8
         *
         * A single byte is taken as a code point and encoded. Longer
         * values are assumed to be UTF-8 already (the translation table
         * of newer PHP versions is UTF-8 by default) and are returned
         * unchanged - ord() would only look at their first byte.
         *
         * @param  string $c character from the translation table
         * @return string UTF-8 encoded character
         */
        function makeutf8($c) {
            if (strlen($c) !== 1) return $c;
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback decoding one matched entity
         *
         * @param  array $ent match array of the pattern used in utf8_unhtml()
         * @return string the decoded character, unknown named entities are
         *                returned unchanged
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
585ea2eed85Sandi
586df957b36SAndreas Gohrif(!function_exists('utf8_to_unicode')){
587ea2eed85Sandi    /**
5881abfaba4SAndreas Gohr     * Takes an UTF-8 string and returns an array of ints representing the
5891abfaba4SAndreas Gohr     * Unicode characters. Astral planes are supported ie. the ints in the
5901abfaba4SAndreas Gohr     * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
5911abfaba4SAndreas Gohr     * are not allowed.
59282257610Sandi     *
5931abfaba4SAndreas Gohr     * If $strict is set to true the function returns false if the input
5941abfaba4SAndreas Gohr     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
5951abfaba4SAndreas Gohr     * level E_USER_WARNING
5961abfaba4SAndreas Gohr     *
5971abfaba4SAndreas Gohr     * Note: this function has been modified slightly in this library to
5981abfaba4SAndreas Gohr     * trigger errors on encountering bad bytes
5991abfaba4SAndreas Gohr     *
6001abfaba4SAndreas Gohr     * @author <hsivonen@iki.fi>
6011abfaba4SAndreas Gohr     * @author Harry Fuecks <hfuecks@gmail.com>
6021abfaba4SAndreas Gohr     * @param  string  UTF-8 encoded string
6031abfaba4SAndreas Gohr     * @param  boolean Check for invalid sequences?
60444881bd0Shenning.noren     * @return mixed array of unicode code points or false if UTF-8 invalid
6051abfaba4SAndreas Gohr     * @see    unicode_to_utf8
6061abfaba4SAndreas Gohr     * @link   http://hsivonen.iki.fi/php-utf8/
6071abfaba4SAndreas Gohr     * @link   http://sourceforge.net/projects/phputf8/
60882257610Sandi     */
6091abfaba4SAndreas Gohr    function utf8_to_unicode($str,$strict=false) {
6101abfaba4SAndreas Gohr        $mState = 0;     // cached expected number of octets after the current octet
6111abfaba4SAndreas Gohr                         // until the beginning of the next UTF8 character sequence
6121abfaba4SAndreas Gohr        $mUcs4  = 0;     // cached Unicode character
6131abfaba4SAndreas Gohr        $mBytes = 1;     // cached expected number of octets in the current sequence
61482257610Sandi
6151abfaba4SAndreas Gohr        $out = array();
6161abfaba4SAndreas Gohr
6171abfaba4SAndreas Gohr        $len = strlen($str);
6181abfaba4SAndreas Gohr
6191abfaba4SAndreas Gohr        for($i = 0; $i < $len; $i++) {
6201abfaba4SAndreas Gohr
6211abfaba4SAndreas Gohr            $in = ord($str{$i});
6221abfaba4SAndreas Gohr
6231abfaba4SAndreas Gohr            if ( $mState == 0) {
6241abfaba4SAndreas Gohr
6251abfaba4SAndreas Gohr                // When mState is zero we expect either a US-ASCII character or a
6261abfaba4SAndreas Gohr                // multi-octet sequence.
6271abfaba4SAndreas Gohr                if (0 == (0x80 & ($in))) {
6281abfaba4SAndreas Gohr                    // US-ASCII, pass straight through.
6291abfaba4SAndreas Gohr                    $out[] = $in;
6301abfaba4SAndreas Gohr                    $mBytes = 1;
6311abfaba4SAndreas Gohr
6321abfaba4SAndreas Gohr                } else if (0xC0 == (0xE0 & ($in))) {
6331abfaba4SAndreas Gohr                    // First octet of 2 octet sequence
6341abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6351abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
6361abfaba4SAndreas Gohr                    $mState = 1;
6371abfaba4SAndreas Gohr                    $mBytes = 2;
6381abfaba4SAndreas Gohr
6391abfaba4SAndreas Gohr                } else if (0xE0 == (0xF0 & ($in))) {
6401abfaba4SAndreas Gohr                    // First octet of 3 octet sequence
6411abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6421abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
6431abfaba4SAndreas Gohr                    $mState = 2;
6441abfaba4SAndreas Gohr                    $mBytes = 3;
6451abfaba4SAndreas Gohr
6461abfaba4SAndreas Gohr                } else if (0xF0 == (0xF8 & ($in))) {
6471abfaba4SAndreas Gohr                    // First octet of 4 octet sequence
6481abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6491abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x07) << 18;
6501abfaba4SAndreas Gohr                    $mState = 3;
6511abfaba4SAndreas Gohr                    $mBytes = 4;
6521abfaba4SAndreas Gohr
6531abfaba4SAndreas Gohr                } else if (0xF8 == (0xFC & ($in))) {
6541abfaba4SAndreas Gohr                    /* First octet of 5 octet sequence.
6551abfaba4SAndreas Gohr                     *
6561abfaba4SAndreas Gohr                     * This is illegal because the encoded codepoint must be either
6571abfaba4SAndreas Gohr                     * (a) not the shortest form or
6581abfaba4SAndreas Gohr                     * (b) outside the Unicode range of 0-0x10FFFF.
6591abfaba4SAndreas Gohr                     * Rather than trying to resynchronize, we will carry on until the end
6601abfaba4SAndreas Gohr                     * of the sequence and let the later error handling code catch it.
6611abfaba4SAndreas Gohr                     */
6621abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6631abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 0x03) << 24;
6641abfaba4SAndreas Gohr                    $mState = 4;
6651abfaba4SAndreas Gohr                    $mBytes = 5;
6661abfaba4SAndreas Gohr
6671abfaba4SAndreas Gohr                } else if (0xFC == (0xFE & ($in))) {
6681abfaba4SAndreas Gohr                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
6691abfaba4SAndreas Gohr                    $mUcs4 = ($in);
6701abfaba4SAndreas Gohr                    $mUcs4 = ($mUcs4 & 1) << 30;
6711abfaba4SAndreas Gohr                    $mState = 5;
6721abfaba4SAndreas Gohr                    $mBytes = 6;
6731abfaba4SAndreas Gohr
6741abfaba4SAndreas Gohr                } elseif($strict) {
6751abfaba4SAndreas Gohr                    /* Current octet is neither in the US-ASCII range nor a legal first
6761abfaba4SAndreas Gohr                     * octet of a multi-octet sequence.
6771abfaba4SAndreas Gohr                     */
6781abfaba4SAndreas Gohr                    trigger_error(
6791abfaba4SAndreas Gohr                            'utf8_to_unicode: Illegal sequence identifier '.
6801abfaba4SAndreas Gohr                                'in UTF-8 at byte '.$i,
6811abfaba4SAndreas Gohr                            E_USER_WARNING
6821abfaba4SAndreas Gohr                        );
68344881bd0Shenning.noren                    return false;
6841abfaba4SAndreas Gohr
6851abfaba4SAndreas Gohr                }
6861abfaba4SAndreas Gohr
6871abfaba4SAndreas Gohr            } else {
6881abfaba4SAndreas Gohr
6891abfaba4SAndreas Gohr                // When mState is non-zero, we expect a continuation of the multi-octet
6901abfaba4SAndreas Gohr                // sequence
6911abfaba4SAndreas Gohr                if (0x80 == (0xC0 & ($in))) {
6921abfaba4SAndreas Gohr
6931abfaba4SAndreas Gohr                    // Legal continuation.
6941abfaba4SAndreas Gohr                    $shift = ($mState - 1) * 6;
6951abfaba4SAndreas Gohr                    $tmp = $in;
6961abfaba4SAndreas Gohr                    $tmp = ($tmp & 0x0000003F) << $shift;
6971abfaba4SAndreas Gohr                    $mUcs4 |= $tmp;
6981abfaba4SAndreas Gohr
6991abfaba4SAndreas Gohr                    /**
7001abfaba4SAndreas Gohr                     * End of the multi-octet sequence. mUcs4 now contains the final
7011abfaba4SAndreas Gohr                     * Unicode codepoint to be output
7021abfaba4SAndreas Gohr                     */
7031abfaba4SAndreas Gohr                    if (0 == --$mState) {
7041abfaba4SAndreas Gohr
7051abfaba4SAndreas Gohr                        /*
7061abfaba4SAndreas Gohr                         * Check for illegal sequences and codepoints.
7071abfaba4SAndreas Gohr                         */
7081abfaba4SAndreas Gohr                        // From Unicode 3.1, non-shortest form is illegal
7091abfaba4SAndreas Gohr                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
7101abfaba4SAndreas Gohr                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
7111abfaba4SAndreas Gohr                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
7121abfaba4SAndreas Gohr                            (4 < $mBytes) ||
7131abfaba4SAndreas Gohr                            // From Unicode 3.2, surrogate characters are illegal
7141abfaba4SAndreas Gohr                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
7151abfaba4SAndreas Gohr                            // Codepoints outside the Unicode range are illegal
7161abfaba4SAndreas Gohr                            ($mUcs4 > 0x10FFFF)) {
7171abfaba4SAndreas Gohr
7181abfaba4SAndreas Gohr                            if($strict){
7191abfaba4SAndreas Gohr                                trigger_error(
7201abfaba4SAndreas Gohr                                        'utf8_to_unicode: Illegal sequence or codepoint '.
7211abfaba4SAndreas Gohr                                            'in UTF-8 at byte '.$i,
7221abfaba4SAndreas Gohr                                        E_USER_WARNING
7231abfaba4SAndreas Gohr                                    );
7241abfaba4SAndreas Gohr
72544881bd0Shenning.noren                                return false;
7261abfaba4SAndreas Gohr                            }
7271abfaba4SAndreas Gohr
7281abfaba4SAndreas Gohr                        }
7291abfaba4SAndreas Gohr
7301abfaba4SAndreas Gohr                        if (0xFEFF != $mUcs4) {
7311abfaba4SAndreas Gohr                            // BOM is legal but we don't want to output it
7321abfaba4SAndreas Gohr                            $out[] = $mUcs4;
7331abfaba4SAndreas Gohr                        }
7341abfaba4SAndreas Gohr
7351abfaba4SAndreas Gohr                        //initialize UTF8 cache
7361abfaba4SAndreas Gohr                        $mState = 0;
7371abfaba4SAndreas Gohr                        $mUcs4  = 0;
7381abfaba4SAndreas Gohr                        $mBytes = 1;
7391abfaba4SAndreas Gohr                    }
7401abfaba4SAndreas Gohr
7411abfaba4SAndreas Gohr                } elseif($strict) {
7421abfaba4SAndreas Gohr                    /**
7431abfaba4SAndreas Gohr                     *((0xC0 & (*in) != 0x80) && (mState != 0))
7441abfaba4SAndreas Gohr                     * Incomplete multi-octet sequence.
7451abfaba4SAndreas Gohr                     */
7461abfaba4SAndreas Gohr                    trigger_error(
7471abfaba4SAndreas Gohr                            'utf8_to_unicode: Incomplete multi-octet '.
7481abfaba4SAndreas Gohr                            '   sequence in UTF-8 at byte '.$i,
7491abfaba4SAndreas Gohr                            E_USER_WARNING
7501abfaba4SAndreas Gohr                        );
7511abfaba4SAndreas Gohr
75244881bd0Shenning.noren                    return false;
75382257610Sandi                }
75482257610Sandi            }
75582257610Sandi        }
7561abfaba4SAndreas Gohr        return $out;
75782257610Sandi    }
758df957b36SAndreas Gohr}
75982257610Sandi
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or > 0x10FFFF) and raises a PHP error at level
     * E_USER_WARNING. Without $strict such values are silently skipped.
     *
     * Note: the result is built by plain string concatenation. No output
     * buffer is opened, so returning early in strict mode cannot leave a
     * dangling buffer behind.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points;
     *               an empty string when $arr is not an array
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            # Outside the Unicode range. Negative values are rejected here as
            # well, otherwise they would be encoded as a bogus 2 byte sequence.
            if ($cp < 0 || $cp > 0x10ffff) {

                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );

                    // out of range
                    return false;
                }

            # ASCII range (including control chars)
            } else if ($cp <= 0x007f) {

                $result .= chr($cp);

            # 2 byte sequence
            } else if ($cp <= 0x07ff) {

                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            # Byte order mark (skip)
            } else if($cp == 0xFEFF) {

                // nop -- zap the BOM

            # Test for illegal surrogates
            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            # 3 byte sequence
            } else if ($cp <= 0xffff) {

                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            # 4 byte sequence (everything left is <= 0x10ffff)
            } else {

                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));

            }
        }

        return $result;
    }
}
85282257610Sandi
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * When UTF8_MBSTRING is set the conversion is delegated to
     * mb_convert_encoding(). Otherwise the string is decoded with
     * utf8_to_unicode() and every code point is packed as big endian 16 bit
     * units; code points above 0xFFFF are written as a surrogate pair.
     *
     * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
     * @param  boolean $bom prepend the UTF-16BE byte order mark?
     * @return string  UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0x10FFFF){
                // non-strict utf8_to_unicode() lets illegal 5/6 octet values
                // through; they have no UTF-16 form, emit U+FFFD instead
                $out .= pack('n',0xFFFD);
            }elseif($cp > 0xFFFF){
                // astral plane: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
87015fa0b4fSAndreas Gohr
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is split into big endian 16 bit units. A high surrogate
     * directly followed by a low surrogate is combined into one astral code
     * point; every other unit is passed on unchanged. The resulting code
     * points are encoded by unicode_to_utf8() in non-strict mode.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $uni  = array();
        $high = 0;       // pending high surrogate, 0 when there is none

        foreach($units as $unit){
            if($high){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    // complete pair -> one code point above 0xFFFF
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = 0;
                    continue;
                }
                // unpaired high surrogate: hand it on as a plain unit
                $uni[] = $high;
                $high  = 0;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        if($high) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
88215fa0b4fSAndreas Gohr
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned in place: the pattern is anchored with \G and
     * applied at an advancing byte offset, so the remaining input is never
     * copied.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str     string to search
     * @param string $replace string to replace bad bytes with (defaults to '',
     *                        i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte

        $result = '';
        $pos    = 0;
        $len    = strlen($str);

        // each round consumes exactly one valid character or one bad byte
        while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
            if ( !isset($matches[2])) {
                // group 2 did not take part: a valid sequence, keep it
                $result .= $matches[0];
            } else {
                // only the catch-all alternative matched: bad byte
                $result .= $replace;
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
924ab77016bSAndreas Gohr
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside a UTF-8 string onto a character boundary
     *
     * An index that sits on a continuation byte (10xxxxxx) is shifted until
     * it no longer does. Indexes at or below 0 give 0, indexes at or beyond
     * the string length give the string length.
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = up (start of the current character)
     *                           true = down (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) {
            return 0;
        }

        $length = strlen($str);
        if ($i >= $length) {
            return $length;
        }

        if ($next) {
            // walk forward past continuation bytes
            for (; $i < $length; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) {
                    break;
                }
            }
            return $i;
        }

        // walk backward to the lead byte of the current character
        for (; $i != 0; $i--) {
            if ((ord($str[$i]) & 0xC0) != 0x80) {
                break;
            }
        }
        return $i;
    }
}
9555953e889Schris
956ab77016bSAndreas Gohr// only needed if no mb_string available
957ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
95815fa0b4fSAndreas Gohr  /**
95982257610Sandi   * UTF-8 Case lookup table
96082257610Sandi   *
   * This lookup table maps each lower case letter to its corresponding
   * upper case letter in UTF-8
96382257610Sandi   *
96482257610Sandi   * @author Andreas Gohr <andi@splitbrain.org>
96582257610Sandi   */
96654662a04SAndreas Gohr  global $UTF8_LOWER_TO_UPPER;
967df957b36SAndreas Gohr  if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
96872de9068SAndreas Gohr    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
96972de9068SAndreas Gohr    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
97072de9068SAndreas Gohr    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
97172de9068SAndreas Gohr    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
97272de9068SAndreas Gohr    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
97372de9068SAndreas Gohr    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
97472de9068SAndreas Gohr    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
97572de9068SAndreas Gohr    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
97672de9068SAndreas Gohr    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
97772de9068SAndreas Gohr    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
97872de9068SAndreas Gohr    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
97972de9068SAndreas Gohr    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
98072de9068SAndreas Gohr    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
98172de9068SAndreas Gohr    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
98272de9068SAndreas Gohr    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
98372de9068SAndreas Gohr    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
98472de9068SAndreas Gohr    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
98572de9068SAndreas Gohr    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
98672de9068SAndreas Gohr    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
98772de9068SAndreas Gohr    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
98872de9068SAndreas Gohr    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
98972de9068SAndreas Gohr    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
99072de9068SAndreas Gohr    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
99172de9068SAndreas Gohr    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
99272de9068SAndreas Gohr    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
99372de9068SAndreas Gohr    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
99472de9068SAndreas Gohr    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
99572de9068SAndreas Gohr    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
99672de9068SAndreas Gohr    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
99772de9068SAndreas Gohr    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
99872de9068SAndreas Gohr    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
99972de9068SAndreas Gohr    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
100072de9068SAndreas Gohr    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
100172de9068SAndreas Gohr    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
100272de9068SAndreas Gohr    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
100372de9068SAndreas Gohr    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
100472de9068SAndreas Gohr    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
100572de9068SAndreas Gohr    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
100672de9068SAndreas Gohr    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
100772de9068SAndreas Gohr    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
100872de9068SAndreas Gohr    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
100972de9068SAndreas Gohr    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
101072de9068SAndreas Gohr    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
101172de9068SAndreas Gohr    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
101272de9068SAndreas Gohr    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
101372de9068SAndreas Gohr    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
101472de9068SAndreas Gohr    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
101572de9068SAndreas Gohr    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
101672de9068SAndreas Gohr    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
101772de9068SAndreas Gohr    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
101872de9068SAndreas Gohr    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
101972de9068SAndreas Gohr    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
102072de9068SAndreas Gohr    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
102172de9068SAndreas Gohr    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
102272de9068SAndreas Gohr    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
102372de9068SAndreas Gohr    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
102472de9068SAndreas Gohr    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
102572de9068SAndreas Gohr    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
102672de9068SAndreas Gohr    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
102772de9068SAndreas Gohr    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
102872de9068SAndreas Gohr    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
102972de9068SAndreas Gohr    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
103072de9068SAndreas Gohr    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
103172de9068SAndreas Gohr    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
103272de9068SAndreas Gohr    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
103372de9068SAndreas Gohr    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
103472de9068SAndreas Gohr    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
103582257610Sandi  );
103682257610Sandi
103782257610Sandi  /**
103882257610Sandi   * UTF-8 Case lookup table
103982257610Sandi   *
   * This lookup table maps each upper case letter to its corresponding
   * lower case letter in UTF-8
104282257610Sandi   *
104382257610Sandi   * @author Andreas Gohr <andi@splitbrain.org>
104482257610Sandi   */
104554662a04SAndreas Gohr  global $UTF8_UPPER_TO_LOWER;
1046df957b36SAndreas Gohr  if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
104772de9068SAndreas Gohr    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
104872de9068SAndreas Gohr    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
104972de9068SAndreas Gohr    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
105072de9068SAndreas Gohr    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
105172de9068SAndreas Gohr    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
105272de9068SAndreas Gohr    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
105372de9068SAndreas Gohr    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
105472de9068SAndreas Gohr    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
105572de9068SAndreas Gohr    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
105672de9068SAndreas Gohr    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
105772de9068SAndreas Gohr    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
105872de9068SAndreas Gohr    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
105972de9068SAndreas Gohr    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
106072de9068SAndreas Gohr    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
106172de9068SAndreas Gohr    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
106272de9068SAndreas Gohr    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
106372de9068SAndreas Gohr    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
106472de9068SAndreas Gohr    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
106572de9068SAndreas Gohr    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
106672de9068SAndreas Gohr    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
106772de9068SAndreas Gohr    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
106872de9068SAndreas Gohr    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
106972de9068SAndreas Gohr    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
107072de9068SAndreas Gohr    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
107172de9068SAndreas Gohr    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
107272de9068SAndreas Gohr    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
107372de9068SAndreas Gohr    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
107472de9068SAndreas Gohr    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
107572de9068SAndreas Gohr    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
107672de9068SAndreas Gohr    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
107772de9068SAndreas Gohr    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
107872de9068SAndreas Gohr    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
107972de9068SAndreas Gohr    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
108072de9068SAndreas Gohr    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
108172de9068SAndreas Gohr    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
108272de9068SAndreas Gohr    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
108372de9068SAndreas Gohr    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
108472de9068SAndreas Gohr    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
108572de9068SAndreas Gohr    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
108672de9068SAndreas Gohr    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
108772de9068SAndreas Gohr    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
108872de9068SAndreas Gohr    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
108972de9068SAndreas Gohr    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
109072de9068SAndreas Gohr    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
109172de9068SAndreas Gohr    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
109272de9068SAndreas Gohr    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
109372de9068SAndreas Gohr    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
109472de9068SAndreas Gohr    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
109572de9068SAndreas Gohr    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
109672de9068SAndreas Gohr    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
109772de9068SAndreas Gohr    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
109872de9068SAndreas Gohr    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
109972de9068SAndreas Gohr    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
110072de9068SAndreas Gohr    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
110172de9068SAndreas Gohr    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
110272de9068SAndreas Gohr    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
110372de9068SAndreas Gohr    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
110472de9068SAndreas Gohr    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
110572de9068SAndreas Gohr    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
110672de9068SAndreas Gohr    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
110772de9068SAndreas Gohr    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
110872de9068SAndreas Gohr    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
110972de9068SAndreas Gohr    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
111072de9068SAndreas Gohr    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
111172de9068SAndreas Gohr    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
111272de9068SAndreas Gohr    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
111372de9068SAndreas Gohr    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
111472de9068SAndreas Gohr  );
111572de9068SAndreas Gohr}; // end of case lookup tables
1116ab77016bSAndreas Gohr
111782257610Sandi/**
111882257610Sandi * UTF-8 lookup table for lower case accented letters
111982257610Sandi *
112082257610Sandi * This lookup table maps accented characters to replacements from the ASCII-7
112182257610Sandi * range (some are two letters, e.g. 'ß' => 'ss'). These are lower case letters only.
112282257610Sandi *
112382257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
112482257610Sandi * @see    utf8_deaccent()
112582257610Sandi */
112654662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS;
1127df957b36SAndreas Gohrif(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array( // only filled when the global is still empty
112882257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
112982257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
113082257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
113182257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
113282257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
113382257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
113482257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
113582257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
113682257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
113782257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
113882257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
113982257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
114082257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
114182257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
114274c0c504Schris  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
114382257610Sandi);
114482257610Sandi
114582257610Sandi/**
114682257610Sandi * UTF-8 lookup table for upper case accented letters
114782257610Sandi *
114882257610Sandi * This lookup table maps accented characters to replacements from the ASCII-7
114982257610Sandi * range (some are two letters, e.g. 'Ö' => 'Oe'). These are upper case letters only.
115082257610Sandi *
115182257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
115282257610Sandi * @see    utf8_deaccent()
115382257610Sandi */
115454662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS;
1155df957b36SAndreas Gohrif(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array( // only filled when the global is still empty
1156df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1157df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1158df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1159df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1160df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1161df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1162df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1163df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1164df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1165df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1166df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1167df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1168df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1169df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
117074c0c504Schris  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
117182257610Sandi);
117282257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * Every entry is a Unicode code point (not a UTF-8 byte sequence), so every
 * value has to stay within the range 0x0000 to 0x10ffff.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * The table is only filled when the global is still empty.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // Invisible and white space characters, given as code points: figure space,
  // narrow no-break space, word joiner, zero width no-break space (BOM).
  // They were previously written as their UTF-8 byte sequences (e.g. 0xe28087),
  // which are not valid code points. The no-break space 0x00a0 is listed above.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1254340756e4Sandi
1255720307d9Schris// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS data above
1256720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1257df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
125837242afaSTom N Harris    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1259720307d9Schris    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1260fae4b5fcSAndreas Gohr    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1261720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1262720307d9Schris    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1263720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1264720307d9Schris    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1265720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1266720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1267720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1268720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1269720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1270720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1271720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1272d5b23302STom N Harris    '➷➸➹➺➻➼➽➾'.
1273d5b23302STom N Harris    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1274d5b23302STom N Harris    '�'.
1275d5b23302STom N Harris    '�ﹼﹽ'.
1276d5b23302STom N Harris    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1277fae4b5fcSAndreas Gohr    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
12787de9cff5SAndreas Gohr    '����������������������������������������������������������������'.
12797de9cff5SAndreas Gohr    '   ⁠';
1280720307d9Schris
12818a831f2bSAndreas Gohr/**
12828a831f2bSAndreas Gohr * Romanization lookup table
12838a831f2bSAndreas Gohr *
12848a831f2bSAndreas Gohr * This lookup table provides a way to transform strings written in a language
12858a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII.
12868a831f2bSAndreas Gohr *
12878a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
12888a831f2bSAndreas Gohr * one way from non-Latin to ASCII and it works by simple character replacement
12898a831f2bSAndreas Gohr * only. Specialities of each language are not supported.
12908a831f2bSAndreas Gohr *
12918a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
12928a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
12938a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
12948a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
12958a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
12968a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
12978a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
12988a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
1299014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com>
1300fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de>
13018a831f2bSAndreas Gohr */
130254662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
1303df957b36SAndreas Gohrif(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1304176ae32bSAndreas Gohr  // scandinavian - differs from what we do in deaccent
1305176ae32bSAndreas Gohr  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1306176ae32bSAndreas Gohr
13078a831f2bSAndreas Gohr  //russian cyrillic
13088a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
13098a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
13108a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
13118a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
13128a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
13138a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1314d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1315f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
13168a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
13178a831f2bSAndreas Gohr  // Ukrainian cyrillic
13188a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
13198a831f2bSAndreas Gohr  // Georgian
13208a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
13218a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
13228a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
13238a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
13248a831f2bSAndreas Gohr  'ჰ'=>'xh',
13258a831f2bSAndreas Gohr  //Sanskrit
13268a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
13278a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
13288a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
13298a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
13308a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
13318a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
13328a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
13338a831f2bSAndreas Gohr  //Hebrew
13343dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
13353dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
13363dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
13378a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
13388a831f2bSAndreas Gohr  //Arabic
13398a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
13408a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
13418a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
13428a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
13438a831f2bSAndreas Gohr
1344799e0977SAndreas Gohr  // Japanese characters  (last update: 2008-05-09)
13459476a253SAndreas Gohr
13468a831f2bSAndreas Gohr  // Japanese hiragana
1347fed467f8SDenis Scheither
1348fed467f8SDenis Scheither  // 3 character syllables, っ doubles the consonant after
1349fed467f8SDenis Scheither  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1350879205e1SAndreas Gohr  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1351799e0977SAndreas Gohr  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1352879205e1SAndreas Gohr  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1353879205e1SAndreas Gohr  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1354879205e1SAndreas Gohr  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1355879205e1SAndreas Gohr  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1356879205e1SAndreas Gohr  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1357879205e1SAndreas Gohr  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1358879205e1SAndreas Gohr  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1359879205e1SAndreas Gohr  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1360879205e1SAndreas Gohr
1361879205e1SAndreas Gohr  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1362879205e1SAndreas Gohr  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1363879205e1SAndreas Gohr  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1364fed467f8SDenis Scheither
1365fed467f8SDenis Scheither   // 2 character syllables - normal
1366879205e1SAndreas Gohr  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1367fed467f8SDenis Scheither  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1368fed467f8SDenis Scheither  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1369799e0977SAndreas Gohr  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1370799e0977SAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1371fed467f8SDenis Scheither  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1372fed467f8SDenis Scheither  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1373fed467f8SDenis Scheither  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1374fed467f8SDenis Scheither  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1375fed467f8SDenis Scheither  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1376fed467f8SDenis Scheither  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1377879205e1SAndreas Gohr  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1378879205e1SAndreas Gohr  'うぇ'=>'we','うぃ'=>'wi',
1379879205e1SAndreas Gohr  'いぇ'=>'ye',
1380fed467f8SDenis Scheither
1381fed467f8SDenis Scheither  // 2 character syllables, っ doubles the consonant after
1382fed467f8SDenis Scheither  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1383fed467f8SDenis Scheither  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1384fed467f8SDenis Scheither  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1385fed467f8SDenis Scheither  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1386fed467f8SDenis Scheither  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1387fed467f8SDenis Scheither  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1388fed467f8SDenis Scheither  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1389fed467f8SDenis Scheither  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1390fed467f8SDenis Scheither  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1391fed467f8SDenis Scheither  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1392799e0977SAndreas Gohr  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1393fed467f8SDenis Scheither
1394fed467f8SDenis Scheither  // 1 character syllables
1395fed467f8SDenis Scheither  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1396879205e1SAndreas Gohr  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1397fed467f8SDenis Scheither  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1398fed467f8SDenis Scheither  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
13999476a253SAndreas Gohr  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1400fed467f8SDenis Scheither  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1401fed467f8SDenis Scheither  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1402fed467f8SDenis Scheither  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1403fed467f8SDenis Scheither  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1404fed467f8SDenis Scheither  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1405fed467f8SDenis Scheither  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1406fed467f8SDenis Scheither  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1407879205e1SAndreas Gohr  'わ'=>'wa','を'=>'wo',
1408879205e1SAndreas Gohr  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1409879205e1SAndreas Gohr  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
14109476a253SAndreas Gohr  // old characters
14119476a253SAndreas Gohr  'ゑ'=>'we','ゐ'=>'wi',
1412fed467f8SDenis Scheither
14139476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
14149476a253SAndreas Gohr  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
14159476a253SAndreas Gohr  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1416fed467f8SDenis Scheither
14179476a253SAndreas Gohr  // never seen one of those (disabled for the moment)
1418879205e1SAndreas Gohr  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
14199476a253SAndreas Gohr  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
14209476a253SAndreas Gohr  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
14219476a253SAndreas Gohr  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
14229476a253SAndreas Gohr  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
14239476a253SAndreas Gohr  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
14249476a253SAndreas Gohr  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
14259476a253SAndreas Gohr  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
14269476a253SAndreas Gohr  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
14279476a253SAndreas Gohr  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
14289476a253SAndreas Gohr  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
14299476a253SAndreas Gohr  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
14309476a253SAndreas Gohr  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
14319476a253SAndreas Gohr  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1432fed467f8SDenis Scheither
1433fed467f8SDenis Scheither  // 'spare' characters from other romanization systems
1434fed467f8SDenis Scheither  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1435fed467f8SDenis Scheither  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1436fed467f8SDenis Scheither  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1437fed467f8SDenis Scheither  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1438fed467f8SDenis Scheither  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1439fed467f8SDenis Scheither  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1440fed467f8SDenis Scheither  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1441fed467f8SDenis Scheither  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1442fed467f8SDenis Scheither  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1443fed467f8SDenis Scheither  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1444fed467f8SDenis Scheither
1445fed467f8SDenis Scheither
14468a831f2bSAndreas Gohr  // Japanese katakana
1447fed467f8SDenis Scheither
1448fed467f8SDenis Scheither  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1449fed467f8SDenis Scheither  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1450fed467f8SDenis Scheither  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1451fed467f8SDenis Scheither  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1452fed467f8SDenis Scheither  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1453fed467f8SDenis Scheither  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1454fed467f8SDenis Scheither  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1455fed467f8SDenis Scheither  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1456fed467f8SDenis Scheither  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1457fed467f8SDenis Scheither  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1458799e0977SAndreas Gohr  'ッティー'=>'ttii',
1459799e0977SAndreas Gohr  'ッヂィー'=>'ddii',
1460fed467f8SDenis Scheither
1461fed467f8SDenis Scheither  // 3 character syllables - doubled vowels
1462fed467f8SDenis Scheither  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1463fed467f8SDenis Scheither  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1464fed467f8SDenis Scheither  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1465fed467f8SDenis Scheither  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1466fed467f8SDenis Scheither  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1467fed467f8SDenis Scheither  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1468fed467f8SDenis Scheither  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1469fed467f8SDenis Scheither  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1470fed467f8SDenis Scheither  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1471fed467f8SDenis Scheither  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1472fed467f8SDenis Scheither  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1473fed467f8SDenis Scheither  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1474fed467f8SDenis Scheither  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1475fed467f8SDenis Scheither  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1476fed467f8SDenis Scheither  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1477fed467f8SDenis Scheither  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1478fed467f8SDenis Scheither  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1479fed467f8SDenis Scheither  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1480fed467f8SDenis Scheither  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1481fed467f8SDenis Scheither  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1482fed467f8SDenis Scheither  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1483fed467f8SDenis Scheither  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1484fed467f8SDenis Scheither  'ウェー'=>'wee','ウィー'=>'wii',
1485fed467f8SDenis Scheither  'イェー'=>'yee',
1486799e0977SAndreas Gohr  'ティー'=>'tii',
1487799e0977SAndreas Gohr  'ヂィー'=>'dii',
1488fed467f8SDenis Scheither
1489fed467f8SDenis Scheither  // 3 character syllables - doubled consonants
1490fed467f8SDenis Scheither  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1491fed467f8SDenis Scheither  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1492fed467f8SDenis Scheither  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1493fed467f8SDenis Scheither  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1494fed467f8SDenis Scheither  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1495fed467f8SDenis Scheither  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1496fed467f8SDenis Scheither  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1497fed467f8SDenis Scheither  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1498fed467f8SDenis Scheither  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1499799e0977SAndreas Gohr  'ッティ'=>'tti',
1500799e0977SAndreas Gohr  'ッヂィ'=>'ddi',
1501fed467f8SDenis Scheither
1502fed467f8SDenis Scheither  // 3 character syllables - doubled vowel and consonants
1503fed467f8SDenis Scheither  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1504fed467f8SDenis Scheither  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1505fed467f8SDenis Scheither  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1506fed467f8SDenis Scheither  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1507fed467f8SDenis Scheither  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1508fed467f8SDenis Scheither  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1509fed467f8SDenis Scheither  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1510fed467f8SDenis Scheither  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1511799e0977SAndreas Gohr  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1512799e0977SAndreas Gohr  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1513fed467f8SDenis Scheither  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1514fed467f8SDenis Scheither
1515fed467f8SDenis Scheither  // 2 character syllables - normal
1516799e0977SAndreas Gohr  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1517799e0977SAndreas Gohr  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1518799e0977SAndreas Gohr  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1519fed467f8SDenis Scheither  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1520fed467f8SDenis Scheither  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1521fed467f8SDenis Scheither  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1522fed467f8SDenis Scheither  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1523fed467f8SDenis Scheither  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1524fed467f8SDenis Scheither  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1525fed467f8SDenis Scheither  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1526fed467f8SDenis Scheither  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1527879205e1SAndreas Gohr  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1528879205e1SAndreas Gohr  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1529fed467f8SDenis Scheither  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1530fed467f8SDenis Scheither  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1531fed467f8SDenis Scheither  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1532799e0977SAndreas Gohr  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1533fed467f8SDenis Scheither  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1534fed467f8SDenis Scheither  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1535fed467f8SDenis Scheither  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1536fed467f8SDenis Scheither  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1537fed467f8SDenis Scheither  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1538fed467f8SDenis Scheither  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1539fed467f8SDenis Scheither  'ウェ'=>'we','ウィ'=>'wi',
1540fed467f8SDenis Scheither  'イェ'=>'ye',
1541799e0977SAndreas Gohr  'ティ'=>'ti',
1542799e0977SAndreas Gohr  'ヂィ'=>'di',
1543fed467f8SDenis Scheither
1544fed467f8SDenis Scheither  // 2 character syllables - doubled vowel
1545fed467f8SDenis Scheither  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1546fed467f8SDenis Scheither  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1547fed467f8SDenis Scheither  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1548fed467f8SDenis Scheither  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1549fed467f8SDenis Scheither  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1550fed467f8SDenis Scheither  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1551fed467f8SDenis Scheither  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1552fed467f8SDenis Scheither  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1553fed467f8SDenis Scheither  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1554fed467f8SDenis Scheither  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1555fed467f8SDenis Scheither  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1556799e0977SAndreas Gohr  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1557fed467f8SDenis Scheither  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1558fed467f8SDenis Scheither  'ワー'=>'waa','ヲー'=>'woo',
1559fed467f8SDenis Scheither  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1560fed467f8SDenis Scheither  'ヵー'=>'kaa','ヶー'=>'kee',
15619476a253SAndreas Gohr  // old characters
15629476a253SAndreas Gohr  'ヱー'=>'wee','ヰー'=>'wii',
1563fed467f8SDenis Scheither
1564879205e1SAndreas Gohr  // separate katakana 'n'
1565879205e1SAndreas Gohr  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1566879205e1SAndreas Gohr  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1567879205e1SAndreas Gohr
1568fed467f8SDenis Scheither  // 2 character syllables - doubled consonants
1569fed467f8SDenis Scheither  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1570fed467f8SDenis Scheither  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1571fed467f8SDenis Scheither  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1572fed467f8SDenis Scheither  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1573fed467f8SDenis Scheither  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1574fed467f8SDenis Scheither  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1575fed467f8SDenis Scheither  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1576fed467f8SDenis Scheither  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1577799e0977SAndreas Gohr  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1578799e0977SAndreas Gohr  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1579fed467f8SDenis Scheither  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1580fed467f8SDenis Scheither
1581fed467f8SDenis Scheither  // 1 character syllables
1582fed467f8SDenis Scheither  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1583fed467f8SDenis Scheither  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1584fed467f8SDenis Scheither  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1585fed467f8SDenis Scheither  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1586fed467f8SDenis Scheither  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1587fed467f8SDenis Scheither  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1588fed467f8SDenis Scheither  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1589fed467f8SDenis Scheither  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1590fed467f8SDenis Scheither  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1591fed467f8SDenis Scheither  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1592879205e1SAndreas Gohr  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1593fed467f8SDenis Scheither  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1594fed467f8SDenis Scheither  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1595fed467f8SDenis Scheither  'ワ'=>'wa','ヲ'=>'wo',
1596fed467f8SDenis Scheither  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1597fed467f8SDenis Scheither  'ヵ'=>'ka','ヶ'=>'ke',
15989476a253SAndreas Gohr  // old characters
15999476a253SAndreas Gohr  'ヱ'=>'we','ヰ'=>'wi',
1600fed467f8SDenis Scheither
16019476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1602fed467f8SDenis Scheither  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1603fed467f8SDenis Scheither  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1604fed467f8SDenis Scheither
1605799e0977SAndreas Gohr  // special characters
1606799e0977SAndreas Gohr  '・'=>'_','、'=>'_',
1607799e0977SAndreas Gohr  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1608799e0977SAndreas Gohr
1609fed467f8SDenis Scheither  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1610fed467f8SDenis Scheither  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1611fed467f8SDenis Scheither  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1612fed467f8SDenis Scheither  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1613fed467f8SDenis Scheither  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1614fed467f8SDenis Scheither  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1615fed467f8SDenis Scheither  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1616fed467f8SDenis Scheither  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1617fed467f8SDenis Scheither  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1618fed467f8SDenis Scheither  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1619fed467f8SDenis Scheither  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1620fed467f8SDenis Scheither  //'シ'=>'ci','フ'=>'hu','シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
16218a831f2bSAndreas Gohr
16228a831f2bSAndreas Gohr  // "Greeklish"
16238a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
16248a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
16258a831f2bSAndreas Gohr
16268a831f2bSAndreas Gohr  // Thai
16278a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
16288a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
16298a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
16308a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
16318a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
16328a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1633014d0ab6SAndreas Gohr  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1634014d0ab6SAndreas Gohr  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1635014d0ab6SAndreas Gohr  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1636014d0ab6SAndreas Gohr  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1637014d0ab6SAndreas Gohr  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1638014d0ab6SAndreas Gohr  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1639014d0ab6SAndreas Gohr  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1640014d0ab6SAndreas Gohr  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1641014d0ab6SAndreas Gohr  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1642014d0ab6SAndreas Gohr  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1643014d0ab6SAndreas Gohr	'๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1644014d0ab6SAndreas Gohr  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
16458a831f2bSAndreas Gohr
16468a831f2bSAndreas Gohr  // Korean
16478a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
16488a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
16498a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
16508a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
16518a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
16528a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
16538a831f2bSAndreas Gohr);
1654340756e4Sandi
1655340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
16568a831f2bSAndreas Gohr
1657