xref: /dokuwiki/inc/utf8.php (revision 7de9cff5dce5558a179b6f9b3379f2bb9a93544c)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Detect mbstring support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and the opt-out constant
 * UTF8_NOMBSTRING has not been defined, 0 otherwise. A value defined before
 * this point is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// mb_substr() is called further down without an explicit encoding argument,
// so the internal encoding is switched to UTF-8 once here
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
215e613a5cSchris
22ab77016bSAndreas Gohr
/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string is returned untouched if it
 * consists only of the characters a-z A-Z 0-9 / _ - . and % - this makes it
 * safe to run the function multiple times on the same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename to encode
 * @param  boolean $safe skip encoding when only safe characters are present
 * @return string  the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // anchor with \z instead of $: $ also matches right before a trailing
  // newline, which would let "name\n" pass through unencoded
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+\z#',$file)){
    return $file;
  }
  $file = urlencode($file);
  // urlencode() escapes the directory separator, restore it
  $file = str_replace('%2F','/',$file);
  return $file;
}
4449c713a3Sandi
/**
 * URL-Decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5749c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $str the string to inspect
 * @return boolean true when no byte above 127 is present (also for '')
 */
function utf8_isASCII($str){
  // byte-wise match (no /u flag): any byte in 0x80-0xFF is a high byte
  return !preg_match('/[\x80-\xFF]/', $str);
}
6944f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 127 is dropped, all other
 * bytes are kept in their original order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input without any bytes >= 0x80
 */
function utf8_strip($str){
  // byte-wise replace (no /u flag) so invalid UTF-8 is handled as well
  return preg_replace('/[\x80-\xFF]/', '', $str);
}
86e1906e6eSandi
/**
 * Tries to detect if a string is UTF-8 encoded
 *
 * Only the byte structure is verified: every lead byte has to be followed
 * by the number of 10bbbbbb continuation bytes it announces. Lead bytes for
 * 5 and 6 byte sequences are accepted and the decoded values are not
 * inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string  $Str the bytes to check
 * @return boolean false as soon as a malformed or truncated sequence is found
 */
function utf8_check($Str) {
 $total = strlen($Str);
 $pos = 0;
 while ($pos < $total) {
  $lead = ord($Str[$pos]);

  if ($lead < 0x80) { # 0bbbbbbb
   $pos++;
   continue;
  }

  # number of continuation bytes announced by the lead byte
  if (($lead & 0xE0) == 0xC0) {       # 110bbbbb
   $tail = 1;
  } elseif (($lead & 0xF0) == 0xE0) { # 1110bbbb
   $tail = 2;
  } elseif (($lead & 0xF8) == 0xF0) { # 11110bbb
   $tail = 3;
  } elseif (($lead & 0xFC) == 0xF8) { # 111110bb
   $tail = 4;
  } elseif (($lead & 0xFE) == 0xFC) { # 1111110b
   $tail = 5;
  } else {
   return false; # Does not match any model
  }

  # each announced byte has to exist and match 10bbbbbb
  while ($tail > 0) {
   $pos++;
   if ($pos == $total) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
   $tail--;
  }
  $pos++;
 }
 return true;
}
11049c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode(), which turns every character
 * that is not in ISO-8859-1 into a single '?'. The byte length of that
 * result is therefore the character count of the input.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string  $string UTF-8 encoded string
 * @return integer number of characters
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
1252f954959Sandi
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * With mbstring available the work is delegated to mb_substr(). Otherwise
 * the substring is cut out with a PCRE pattern using the 'u' flag:
 *
 *  - PCRE only accepts repetition counts below 65536, so larger offsets and
 *    lengths are expressed as repeated groups of 65535 characters
 *  - like mb_substr() an empty string (never false) is returned when
 *    nothing can be extracted
 *  - counting the characters of the string is comparatively expensive and
 *    is only done when needed (negative offset or a given length)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param  string  $str    UTF-8 encoded string
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return string  the requested part, '' if there is none
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        // the length is only passed on when the caller supplied one
        return ($length === null) ? mb_substr($str, $offset)
                                  : mb_substr($str, $offset, $length);
    }

    // cast parameters once to avoid multiple notices/warnings later on
    $str    = (string)$str;
    $offset = (int)$offset;
    if ($length !== null) $length = (int)$length;

    // trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $chars = null;                                // character count, computed lazily

    // turn a negative offset into the equivalent positive one
    // (a tail anchored pattern would work too but is very slow)
    if ($offset < 0) {
        $chars  = strlen(utf8_decode($str));
        $offset = $chars + $offset;
        if ($offset < 0) $offset = 0;
    }

    // skip $offset characters in a non-capturing group, or just anchor
    $head = '^';
    if ($offset > 0) {
        $head = '^(?:'._utf8_substr_span($offset).')';
    }

    $body = '';
    if ($length === null) {
        $body = '(.*)$';                          // the rest of the string
    } else {
        if ($chars === null) $chars = strlen(utf8_decode($str));
        if ($offset > $chars) return '';          // offset lies behind the end

        if ($length > 0) {
            // never ask for more characters than are left
            $length = min($chars - $offset, $length);
            // capture exactly $length characters
            $body = '('._utf8_substr_span($length).')';
        } elseif ($length < 0) {
            if ($length < ($offset - $chars)) return '';
            // capture everything but the last -$length characters
            $body = '(.*)(?:'._utf8_substr_span(-$length).')$';
        }
    }

    if (!preg_match('#'.$head.$body.'#us', $str, $match)) return '';
    return $match[1];
}

/**
 * Build the PCRE fragment matching exactly $count arbitrary characters
 *
 * Private helper of utf8_substr(); counts above 65535 are split into
 * repeated groups of 65535 plus a remainder.
 *
 * @param  integer $count number of characters to match
 * @return string  pattern fragment without delimiters
 */
function _utf8_substr_span($count) {
    $blocks = (int)($count/65535);
    $rest   = $count%65535;

    $span = '';
    if ($blocks) $span = '(?:.{65535}){'.$blocks.'}';
    return $span.'.{'.$rest.'}';
}
22710f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * Unlike substr_replace() the default $length is 0, i.e. the replacement
 * is inserted at $start without removing anything.
 *
 * Negative values are interpreted like substr_replace() does: a negative
 * $start counts from the end of the string, a negative $length stops the
 * replaced part that many characters before the end.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded input
 * @param  string  $replacement text to put in
 * @param  integer $start       character offset where the replacement starts
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  // the character count is only needed to resolve negative arguments
  if($start < 0 || $length < 0){
    $total = utf8_strlen($string);
    if($start < 0)  $start  = max(0, $total + $start);
    if($length < 0) $length = max(0, $total - $start + $length);
  }

  $ret = '';
  if($start>0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
241dc57ef04Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. With a charlist every
 * listed character is stripped from the beginning of the string; the list
 * is taken literally (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass, this includes ^ which
  //would otherwise negate the class when it comes first
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
257f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. With a charlist every
 * listed character is stripped from the end of the string; the list is
 * taken literally (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass, this includes ^ which
  //would otherwise negate the class when it comes first
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  //D modifier: $ must only match at the very end, not before a final newline
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
273f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Falls back to the native trim() when no charlist is given, otherwise
 * the right and then the left side are trimmed with the UTF-8 aware
 * helpers.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @see    utf8_ltrim()
 * @see    utf8_rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist != ''){
    $str = utf8_rtrim($str,$charlist);
    return utf8_ltrim($str,$charlist);
  }

  return trim($str);
}
286f29317c1Sandi
2872f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension if available, otherwise the string is
 * translated through the global $UTF8_UPPER_TO_LOWER table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(!UTF8_MBSTRING){
    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
  }

  return mb_strtolower($string,'utf-8');
}
30382257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension if available, otherwise the string is
 * translated through the global $UTF8_LOWER_TO_UPPER table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  if(!UTF8_MBSTRING){
    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
  }

  return mb_strtoupper($string,'utf-8');
}
31982257610Sandi
/**
 * UTF-8 aware alternative to ucfirst
 * Make a string's first character uppercase
 *
 * @author Harry Fuecks
 * @see    utf8_strtoupper()
 * @param  string $str UTF-8 encoded string
 * @return string with first character as upper case (if applicable)
 */
function utf8_ucfirst($str){
  $chars = utf8_strlen($str);

  if($chars == 0) return '';
  // a single character can be uppercased as a whole
  if($chars == 1) return utf8_strtoupper($str);

  // split into the first character and the remainder
  preg_match('/^(.{1})(.*)$/us', $str, $matches);
  return utf8_strtoupper($matches[1]).$matches[2];
}
33926ece5a7SAndreas Gohr
/**
 * UTF-8 aware alternative to ucwords
 * Uppercase the first character of each word in a string
 *
 * A word is a run of characters delimited by form feeds, horizontal tabs,
 * vertical tabs, linefeeds, carriage returns or spaces (or the start of
 * the string). Each word is passed to utf8_ucwords_callback().
 *
 * @author Harry Fuecks
 * @param  string $str UTF-8 encoded string
 * @return string with first char of each word uppercase
 * @see    utf8_ucwords_callback
 * @see    http://www.php.net/ucwords
 */
function utf8_ucwords($str) {
  // members of the whitespace character class:
  // form feed, horizontal tab, vertical tab, linefeed, carriage return, space
  $ws = '\x0c\x09\x0b\x0a\x0d\x20';

  // (start or whitespace) (first char of the word) (rest of the word)
  $pattern = '/(^|(['.$ws.']+))([^'.$ws.']{1})[^'.$ws.']*/u';

  return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
}
35726ece5a7SAndreas Gohr
/**
 * Callback function for preg_replace_callback call in utf8_ucwords
 * You don't need to call this yourself
 *
 * The match consists of the leading whitespace ($matches[2], empty at the
 * start of the string), the first character of the word ($matches[3]) and
 * the rest of the word.
 *
 * @author Harry Fuecks
 * @param  array $matches matches corresponding to a single word
 * @return string with first char of the word in uppercase
 * @see    utf8_ucwords
 * @see    utf8_strtoupper
 */
function utf8_ucwords_callback($matches) {
  $leadingws = $matches[2];
  $ucfirst = utf8_strtoupper($matches[3]);

  // Cut the tail by the byte lengths of the captured parts. ltrim() must
  // not be used to find the word: its whitespace set differs from the one
  // in the pattern (no form feed, but NUL), which shifted the replaced
  // character.
  $tail = substr($matches[0], strlen($leadingws) + strlen($matches[3]));

  return $leadingws . $ucfirst . $tail;
}
37426ece5a7SAndreas Gohr
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * The replacements are taken from the global $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS tables.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string UTF-8 encoded string
 * @param  integer $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $dolower = ($case <= 0);
  $doupper = ($case >= 0);

  if($dolower){
    global $UTF8_LOWER_ACCENTS;
    $string = strtr($string,$UTF8_LOWER_ACCENTS);
  }

  if($doupper){
    global $UTF8_UPPER_ACCENTS;
    $string = strtr($string,$UTF8_UPPER_ACCENTS);
  }

  return $string;
}
39482257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as is, everything else is translated
 * through the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_isASCII()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    $string = strtr($string,$UTF8_ROMANIZATION);
  }

  return $string;
}
4068a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
 * string. In addition the range 0x00 to 0x19 and whatever is passed in
 * $additional is stripped. The quoted special chars are cached in a static
 * variable after the first call.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (inserted unquoted
 *                            into a regexp character class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;   // declared but not read by this function
  global $UTF8_SPECIAL_CHARS2;

  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  $class = $additional.'\x00-\x19'.$specials;
  return preg_replace('/['.$class.']/u',$repl,$string);
}
430099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * The needle is searched byte-wise with strpos(); each hit is converted
 * into a character position with utf8_strlen(). A hit lying in front of
 * the requested character offset makes the search continue further right.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   string to look for
 * @param  integer $offset   character offset to start searching from
 * @return integer character position of the match, false if not found
 */
function utf8_strpos($haystack, $needle, $offset=0){
    $shift = 0;   // correction added to the byte offset of the next search

    do {
        $bytepos = strpos($haystack, $needle, $offset + $shift);
        if ($bytepos === false) {
            return false;
        }

        // character position = number of characters in front of the hit
        $charpos = utf8_strlen(substr($haystack, 0, $bytepos));

        if ($charpos < $offset) {
            $shift = $bytepos - $charpos;
        }
    } while ($charpos < $offset);

    return $charpos;
}
459f29317c1Sandi
4602f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Code points below 0x80 are copied as plain characters, code points
 * below 0x100 become decimal entities and everything above becomes a
 * hexadecimal entity.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @see    utf8_to_unicode()
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
    $html = '';
    foreach (utf8_to_unicode($str) as $codepoint) {
        if ($codepoint >= 0x100) {
            $html .= '&#x'.dechex($codepoint).';';
        } elseif ($codepoint >= 0x80) {
            $html .= "&#$codepoint;";
        } else {
            $html .= chr($codepoint);
        }
    }
    return $html;
}
4809f9fb0e5STom N Harris
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * By default only numeric entities (&#..; and &#x..;) are decoded. When
 * any non-null value (e.g. HTML_ENTITIES) is passed as second parameter,
 * named entities including &amp; &lt; etc. are handled in the same pass.
 * Doing both in one pass avoids the problem that would occur if you had
 * to decode "&amp;#38;&#38;amp;#38;" with two separate functions.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @see    utf8_decode_numeric()
 * @see    utf8_entity_decoder
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // the decoder is created once and shared between all calls
    static $decoder = null;
    if ($decoder === null) {
        $decoder = new utf8_entity_decoder();
    }

    if ($entities !== null) {
        // numeric and named entities
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
    }

    // numeric entities only
    return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                 'utf8_decode_numeric', $str);
}
/**
 * Callback for utf8_unhtml(): decode a single numeric entity
 *
 * The patterns used by utf8_unhtml() allow any alphanumeric text after the
 * '#'. Text that is not a valid decimal or hexadecimal number is returned
 * untouched instead of being decoded to code point 0.
 *
 * @param  array  $ent regexp match: [0] the whole entity, [2] 'x' or 'X'
 *                     for hexadecimal entities, [3] the digits
 * @return string the UTF-8 encoded character or the unchanged entity
 * @see    utf8_unhtml
 * @see    unicode_to_utf8
 */
function utf8_decode_numeric($ent) {
    $digits = $ent[3];

    if ($ent[2] == 'X' || $ent[2] == 'x') {
        if (strspn($digits, '0123456789abcdefABCDEF') != strlen($digits)) {
            return $ent[0];
        }
        $cp = hexdec($digits);
    } else {
        if (strspn($digits, '0123456789') != strlen($digits)) {
            return $ent[0];
        }
        $cp = intval($digits);
    }

    return unicode_to_utf8(array($cp));
}
/**
 * Entity lookup used by utf8_unhtml() when named entities are decoded
 *
 * Holds a table mapping '&name;' to the UTF-8 encoded character.
 */
class utf8_entity_decoder {
    // map of entity => UTF-8 character, filled by the constructor
    var $table;

    /**
     * Build the lookup table from PHP's HTML entity translation table
     */
    function __construct() {
        // makeutf8() treats every table character as one byte, so the
        // single byte ISO-8859-1 table is needed. Newer PHP versions
        // return a UTF-8 table by default and the encoding has to be
        // requested explicitly (the parameter exists since PHP 5.3.4).
        if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
        } else {
            $table = get_html_translation_table(HTML_ENTITIES);
        }
        $table = array_flip($table);
        $this->table = array_map(array(&$this,'makeutf8'), $table);
    }

    /**
     * Class-named constructor, kept so old PHP versions and code calling
     * it directly still work. It is no longer run automatically by
     * current PHP versions, hence __construct() above.
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert one single byte character to its UTF-8 encoding
     *
     * @param  string $c a single byte character
     * @return string
     */
    function makeutf8($c) {
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * Callback for preg_replace_callback() in utf8_unhtml()
     *
     * @param  array  $ent regexp match: [0] the whole entity, [1] '#' for
     *                     numeric entities
     * @return string the decoded character, unknown named entities are
     *                returned unchanged
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        } elseif (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        } else {
            return $ent[0];
        }
    }
}
542ea2eed85Sandi
543ea2eed85Sandi/**
5441abfaba4SAndreas Gohr * Takes an UTF-8 string and returns an array of ints representing the
5451abfaba4SAndreas Gohr * Unicode characters. Astral planes are supported ie. the ints in the
5461abfaba4SAndreas Gohr * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
5471abfaba4SAndreas Gohr * are not allowed.
54882257610Sandi *
5491abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input
5501abfaba4SAndreas Gohr * string isn't a valid UTF-8 octet sequence and raises a PHP error at
5511abfaba4SAndreas Gohr * level E_USER_WARNING
5521abfaba4SAndreas Gohr *
5531abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to
5541abfaba4SAndreas Gohr * trigger errors on encountering bad bytes
5551abfaba4SAndreas Gohr *
5561abfaba4SAndreas Gohr * @author <hsivonen@iki.fi>
5571abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com>
5581abfaba4SAndreas Gohr * @param  string  UTF-8 encoded string
5591abfaba4SAndreas Gohr * @param  boolean Check for invalid sequences?
56044881bd0Shenning.noren * @return mixed array of unicode code points or false if UTF-8 invalid
5611abfaba4SAndreas Gohr * @see    unicode_to_utf8
5621abfaba4SAndreas Gohr * @link   http://hsivonen.iki.fi/php-utf8/
5631abfaba4SAndreas Gohr * @link   http://sourceforge.net/projects/phputf8/
56482257610Sandi */
5651abfaba4SAndreas Gohrfunction utf8_to_unicode($str,$strict=false) {
5661abfaba4SAndreas Gohr    $mState = 0;     // cached expected number of octets after the current octet
5671abfaba4SAndreas Gohr                     // until the beginning of the next UTF8 character sequence
5681abfaba4SAndreas Gohr    $mUcs4  = 0;     // cached Unicode character
5691abfaba4SAndreas Gohr    $mBytes = 1;     // cached expected number of octets in the current sequence
57082257610Sandi
5711abfaba4SAndreas Gohr    $out = array();
5721abfaba4SAndreas Gohr
5731abfaba4SAndreas Gohr    $len = strlen($str);
5741abfaba4SAndreas Gohr
5751abfaba4SAndreas Gohr    for($i = 0; $i < $len; $i++) {
5761abfaba4SAndreas Gohr
5771abfaba4SAndreas Gohr        $in = ord($str{$i});
5781abfaba4SAndreas Gohr
5791abfaba4SAndreas Gohr        if ( $mState == 0) {
5801abfaba4SAndreas Gohr
5811abfaba4SAndreas Gohr            // When mState is zero we expect either a US-ASCII character or a
5821abfaba4SAndreas Gohr            // multi-octet sequence.
5831abfaba4SAndreas Gohr            if (0 == (0x80 & ($in))) {
5841abfaba4SAndreas Gohr                // US-ASCII, pass straight through.
5851abfaba4SAndreas Gohr                $out[] = $in;
5861abfaba4SAndreas Gohr                $mBytes = 1;
5871abfaba4SAndreas Gohr
5881abfaba4SAndreas Gohr            } else if (0xC0 == (0xE0 & ($in))) {
5891abfaba4SAndreas Gohr                // First octet of 2 octet sequence
5901abfaba4SAndreas Gohr                $mUcs4 = ($in);
5911abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x1F) << 6;
5921abfaba4SAndreas Gohr                $mState = 1;
5931abfaba4SAndreas Gohr                $mBytes = 2;
5941abfaba4SAndreas Gohr
5951abfaba4SAndreas Gohr            } else if (0xE0 == (0xF0 & ($in))) {
5961abfaba4SAndreas Gohr                // First octet of 3 octet sequence
5971abfaba4SAndreas Gohr                $mUcs4 = ($in);
5981abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x0F) << 12;
5991abfaba4SAndreas Gohr                $mState = 2;
6001abfaba4SAndreas Gohr                $mBytes = 3;
6011abfaba4SAndreas Gohr
6021abfaba4SAndreas Gohr            } else if (0xF0 == (0xF8 & ($in))) {
6031abfaba4SAndreas Gohr                // First octet of 4 octet sequence
6041abfaba4SAndreas Gohr                $mUcs4 = ($in);
6051abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x07) << 18;
6061abfaba4SAndreas Gohr                $mState = 3;
6071abfaba4SAndreas Gohr                $mBytes = 4;
6081abfaba4SAndreas Gohr
6091abfaba4SAndreas Gohr            } else if (0xF8 == (0xFC & ($in))) {
6101abfaba4SAndreas Gohr                /* First octet of 5 octet sequence.
6111abfaba4SAndreas Gohr                 *
6121abfaba4SAndreas Gohr                 * This is illegal because the encoded codepoint must be either
6131abfaba4SAndreas Gohr                 * (a) not the shortest form or
6141abfaba4SAndreas Gohr                 * (b) outside the Unicode range of 0-0x10FFFF.
6151abfaba4SAndreas Gohr                 * Rather than trying to resynchronize, we will carry on until the end
6161abfaba4SAndreas Gohr                 * of the sequence and let the later error handling code catch it.
6171abfaba4SAndreas Gohr                 */
6181abfaba4SAndreas Gohr                $mUcs4 = ($in);
6191abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x03) << 24;
6201abfaba4SAndreas Gohr                $mState = 4;
6211abfaba4SAndreas Gohr                $mBytes = 5;
6221abfaba4SAndreas Gohr
6231abfaba4SAndreas Gohr            } else if (0xFC == (0xFE & ($in))) {
6241abfaba4SAndreas Gohr                // First octet of 6 octet sequence, see comments for 5 octet sequence.
6251abfaba4SAndreas Gohr                $mUcs4 = ($in);
6261abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 1) << 30;
6271abfaba4SAndreas Gohr                $mState = 5;
6281abfaba4SAndreas Gohr                $mBytes = 6;
6291abfaba4SAndreas Gohr
6301abfaba4SAndreas Gohr            } elseif($strict) {
6311abfaba4SAndreas Gohr                /* Current octet is neither in the US-ASCII range nor a legal first
6321abfaba4SAndreas Gohr                 * octet of a multi-octet sequence.
6331abfaba4SAndreas Gohr                 */
6341abfaba4SAndreas Gohr                trigger_error(
6351abfaba4SAndreas Gohr                        'utf8_to_unicode: Illegal sequence identifier '.
6361abfaba4SAndreas Gohr                            'in UTF-8 at byte '.$i,
6371abfaba4SAndreas Gohr                        E_USER_WARNING
6381abfaba4SAndreas Gohr                    );
63944881bd0Shenning.noren                return false;
6401abfaba4SAndreas Gohr
6411abfaba4SAndreas Gohr            }
6421abfaba4SAndreas Gohr
6431abfaba4SAndreas Gohr        } else {
6441abfaba4SAndreas Gohr
6451abfaba4SAndreas Gohr            // When mState is non-zero, we expect a continuation of the multi-octet
6461abfaba4SAndreas Gohr            // sequence
6471abfaba4SAndreas Gohr            if (0x80 == (0xC0 & ($in))) {
6481abfaba4SAndreas Gohr
6491abfaba4SAndreas Gohr                // Legal continuation.
6501abfaba4SAndreas Gohr                $shift = ($mState - 1) * 6;
6511abfaba4SAndreas Gohr                $tmp = $in;
6521abfaba4SAndreas Gohr                $tmp = ($tmp & 0x0000003F) << $shift;
6531abfaba4SAndreas Gohr                $mUcs4 |= $tmp;
6541abfaba4SAndreas Gohr
6551abfaba4SAndreas Gohr                /**
6561abfaba4SAndreas Gohr                 * End of the multi-octet sequence. mUcs4 now contains the final
6571abfaba4SAndreas Gohr                 * Unicode codepoint to be output
6581abfaba4SAndreas Gohr                 */
6591abfaba4SAndreas Gohr                if (0 == --$mState) {
6601abfaba4SAndreas Gohr
6611abfaba4SAndreas Gohr                    /*
6621abfaba4SAndreas Gohr                     * Check for illegal sequences and codepoints.
6631abfaba4SAndreas Gohr                     */
6641abfaba4SAndreas Gohr                    // From Unicode 3.1, non-shortest form is illegal
6651abfaba4SAndreas Gohr                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
6661abfaba4SAndreas Gohr                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
6671abfaba4SAndreas Gohr                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
6681abfaba4SAndreas Gohr                        (4 < $mBytes) ||
6691abfaba4SAndreas Gohr                        // From Unicode 3.2, surrogate characters are illegal
6701abfaba4SAndreas Gohr                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
6711abfaba4SAndreas Gohr                        // Codepoints outside the Unicode range are illegal
6721abfaba4SAndreas Gohr                        ($mUcs4 > 0x10FFFF)) {
6731abfaba4SAndreas Gohr
6741abfaba4SAndreas Gohr                        if($strict){
6751abfaba4SAndreas Gohr                            trigger_error(
6761abfaba4SAndreas Gohr                                    'utf8_to_unicode: Illegal sequence or codepoint '.
6771abfaba4SAndreas Gohr                                        'in UTF-8 at byte '.$i,
6781abfaba4SAndreas Gohr                                    E_USER_WARNING
6791abfaba4SAndreas Gohr                                );
6801abfaba4SAndreas Gohr
68144881bd0Shenning.noren                            return false;
6821abfaba4SAndreas Gohr                        }
6831abfaba4SAndreas Gohr
6841abfaba4SAndreas Gohr                    }
6851abfaba4SAndreas Gohr
6861abfaba4SAndreas Gohr                    if (0xFEFF != $mUcs4) {
6871abfaba4SAndreas Gohr                        // BOM is legal but we don't want to output it
6881abfaba4SAndreas Gohr                        $out[] = $mUcs4;
6891abfaba4SAndreas Gohr                    }
6901abfaba4SAndreas Gohr
6911abfaba4SAndreas Gohr                    //initialize UTF8 cache
6921abfaba4SAndreas Gohr                    $mState = 0;
6931abfaba4SAndreas Gohr                    $mUcs4  = 0;
6941abfaba4SAndreas Gohr                    $mBytes = 1;
6951abfaba4SAndreas Gohr                }
6961abfaba4SAndreas Gohr
6971abfaba4SAndreas Gohr            } elseif($strict) {
6981abfaba4SAndreas Gohr                /**
6991abfaba4SAndreas Gohr                 *((0xC0 & (*in) != 0x80) && (mState != 0))
7001abfaba4SAndreas Gohr                 * Incomplete multi-octet sequence.
7011abfaba4SAndreas Gohr                 */
7021abfaba4SAndreas Gohr                trigger_error(
7031abfaba4SAndreas Gohr                        'utf8_to_unicode: Incomplete multi-octet '.
7041abfaba4SAndreas Gohr                        '   sequence in UTF-8 at byte '.$i,
7051abfaba4SAndreas Gohr                        E_USER_WARNING
7061abfaba4SAndreas Gohr                    );
7071abfaba4SAndreas Gohr
70844881bd0Shenning.noren                return false;
70982257610Sandi            }
71082257610Sandi        }
71182257610Sandi    }
7121abfaba4SAndreas Gohr    return $out;
71382257610Sandi}
71482257610Sandi
/**
 * Takes an array of ints representing Unicode code points and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM (0xFEFF) are dropped.
 * Surrogates (0xD800-0xDFFF) and values outside 0-0x10FFFF are never
 * encoded.
 *
 * If $strict is set to true the function returns false as soon as the
 * input array contains an int that represents a surrogate or lies outside
 * the Unicode range, and raises a PHP error at level E_USER_WARNING.
 * If $strict is false such values are silently skipped.
 *
 * Note: the string is built by plain concatenation instead of output
 * buffering, so an early return in strict mode can never leave an
 * output buffer open.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid code points?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        # Outside the Unicode range (negative or above plane 16)
        if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }
            continue;
        }

        # Test for illegal surrogates
        if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }
            continue;
        }

        # Byte order mark (skip)
        if ($cp == 0xFEFF) continue;

        # ASCII range (including control chars)
        if ($cp <= 0x007f) {

            $result .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence (upper bound already checked above)
        } else {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}
80582257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * With mbstring available the work is delegated to mb_convert_encoding().
 * Otherwise the string is decoded with utf8_to_unicode() and every code
 * point is packed as big-endian 16 bit units: code points above 0xFFFF
 * become a surrogate pair, values above 0x10FFFF (which cannot be expressed
 * in UTF-16) are replaced by U+FFFD.
 *
 * @param  string  $str UTF-8 string (passed by reference, not modified here)
 * @param  boolean $bom prepend the UTF-16BE byte order mark?
 * @return string  UTF-16BE encoded string
 * @see    utf16be_to_utf8
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // utf8_to_unicode() has code paths returning false; never iterate a non-array
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0x10FFFF){
      // not representable in UTF-16
      $out .= pack('n',0xFFFD);
    }elseif($cp > 0xFFFF){
      // split into high (lead) and low (trail) surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
82115fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is unpacked into big-endian 16 bit units. A high surrogate
 * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) is
 * combined into the code point above 0xFFFF it stands for; unpaired
 * surrogates are handed on unchanged and left to unicode_to_utf8().
 *
 * @param  string $str UTF-16BE string (passed by reference, not modified here)
 * @return string UTF-8 encoded string
 * @see    utf8_to_utf16be
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = 0; // pending high surrogate, 0 when there is none
  foreach($units as $unit){
    if($high){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        // complete pair
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = 0;
        continue;
      }
      // high surrogate without a matching low one
      $uni[] = $high;
      $high  = 0;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high) $uni[] = $high;

  return unicode_to_utf8($uni);
}
83115fa0b4fSAndreas Gohr
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is walked with an anchored match at a moving byte offset, so
 * the remaining input is never copied and the result is built by plain
 * concatenation.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str     string to search
 * @param string $replace string to replace bad bytes with (defaults to '',
 *                        ie. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    // 'A' anchors every match at the current offset
    $pattern = '/'.$UTF8_BAD.'/AS';
    $length  = strlen($str);
    $offset  = 0;
    $result  = '';

    while ($offset < $length) {
        if (!preg_match($pattern, $str, $matches, 0, $offset)) {
            // nothing matched at this byte: treat it as a bad byte
            $result .= $replace;
            $offset++;
            continue;
        }

        // group 2 only takes part in the match for the "invalid byte" branch
        if (isset($matches[2])) {
            $result .= $replace;
        } else {
            $result .= $matches[0];
        }
        $offset += strlen($matches[0]);
    }

    return $result;
}
871ab77016bSAndreas Gohr
/**
 * Move a byte index inside a utf8 string onto a utf8 character boundary
 *
 * Indexes at or below zero yield 0, indexes at or beyond the end of the
 * string yield the string's byte length. Any other index is moved off
 * continuation bytes (10xxxxxx) in the requested direction.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = up (start of the current character)
 *                           true = down (start of the next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $length = strlen($str);
  if ($i >= $length) return $length;

  if (!$next) {
    // walk backwards until a non-continuation byte or the string start
    for (; $i; $i--) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
    return $i;
  }

  // walk forwards until a non-continuation byte or the string end
  for (; $i < $length; $i++) {
    if ((ord($str[$i]) & 0xC0) != 0x80) break;
  }
  return $i;
}
9005953e889Schris
901ab77016bSAndreas Gohr// only needed if no mb_string available
902ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps lower case letters to their corresponding
   * upper case letter in UTF-8
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
91154662a04SAndreas Gohr  global $UTF8_LOWER_TO_UPPER;
91254662a04SAndreas Gohr  $UTF8_LOWER_TO_UPPER = array(
91372de9068SAndreas Gohr    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
91472de9068SAndreas Gohr    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
91572de9068SAndreas Gohr    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
91672de9068SAndreas Gohr    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
91772de9068SAndreas Gohr    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
91872de9068SAndreas Gohr    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
91972de9068SAndreas Gohr    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
92072de9068SAndreas Gohr    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
92172de9068SAndreas Gohr    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
92272de9068SAndreas Gohr    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
92372de9068SAndreas Gohr    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
92472de9068SAndreas Gohr    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
92572de9068SAndreas Gohr    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
92672de9068SAndreas Gohr    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
92772de9068SAndreas Gohr    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
92872de9068SAndreas Gohr    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
92972de9068SAndreas Gohr    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
93072de9068SAndreas Gohr    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
93172de9068SAndreas Gohr    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
93272de9068SAndreas Gohr    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
93372de9068SAndreas Gohr    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
93472de9068SAndreas Gohr    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
93572de9068SAndreas Gohr    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
93672de9068SAndreas Gohr    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
93772de9068SAndreas Gohr    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
93872de9068SAndreas Gohr    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
93972de9068SAndreas Gohr    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
94072de9068SAndreas Gohr    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
94172de9068SAndreas Gohr    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
94272de9068SAndreas Gohr    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
94372de9068SAndreas Gohr    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
94472de9068SAndreas Gohr    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
94572de9068SAndreas Gohr    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
94672de9068SAndreas Gohr    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
94772de9068SAndreas Gohr    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
94872de9068SAndreas Gohr    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
94972de9068SAndreas Gohr    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
95072de9068SAndreas Gohr    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
95172de9068SAndreas Gohr    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
95272de9068SAndreas Gohr    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
95372de9068SAndreas Gohr    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
95472de9068SAndreas Gohr    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
95572de9068SAndreas Gohr    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
95672de9068SAndreas Gohr    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
95772de9068SAndreas Gohr    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
95872de9068SAndreas Gohr    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
95972de9068SAndreas Gohr    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
96072de9068SAndreas Gohr    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
96172de9068SAndreas Gohr    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
96272de9068SAndreas Gohr    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
96372de9068SAndreas Gohr    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
96472de9068SAndreas Gohr    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
96572de9068SAndreas Gohr    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
96672de9068SAndreas Gohr    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
96772de9068SAndreas Gohr    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
96872de9068SAndreas Gohr    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
96972de9068SAndreas Gohr    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
97072de9068SAndreas Gohr    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
97172de9068SAndreas Gohr    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
97272de9068SAndreas Gohr    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
97372de9068SAndreas Gohr    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
97472de9068SAndreas Gohr    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
97572de9068SAndreas Gohr    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
97672de9068SAndreas Gohr    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
97772de9068SAndreas Gohr    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
97872de9068SAndreas Gohr    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
97972de9068SAndreas Gohr    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
98082257610Sandi  );
98182257610Sandi
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps upper case letters to their corresponding
   * lower case letter in UTF-8
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
99054662a04SAndreas Gohr  global $UTF8_UPPER_TO_LOWER;
99172de9068SAndreas Gohr  $UTF8_UPPER_TO_LOWER = array (
99272de9068SAndreas Gohr    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
99372de9068SAndreas Gohr    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
99472de9068SAndreas Gohr    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
99572de9068SAndreas Gohr    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
99672de9068SAndreas Gohr    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
99772de9068SAndreas Gohr    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
99872de9068SAndreas Gohr    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
99972de9068SAndreas Gohr    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
100072de9068SAndreas Gohr    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
100172de9068SAndreas Gohr    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
100272de9068SAndreas Gohr    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
100372de9068SAndreas Gohr    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
100472de9068SAndreas Gohr    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
100572de9068SAndreas Gohr    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
100672de9068SAndreas Gohr    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
100772de9068SAndreas Gohr    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
100872de9068SAndreas Gohr    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
100972de9068SAndreas Gohr    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
101072de9068SAndreas Gohr    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
101172de9068SAndreas Gohr    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
101272de9068SAndreas Gohr    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
101372de9068SAndreas Gohr    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
101472de9068SAndreas Gohr    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
101572de9068SAndreas Gohr    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
101672de9068SAndreas Gohr    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
101772de9068SAndreas Gohr    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
101872de9068SAndreas Gohr    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
101972de9068SAndreas Gohr    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
102072de9068SAndreas Gohr    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
102172de9068SAndreas Gohr    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
102272de9068SAndreas Gohr    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
102372de9068SAndreas Gohr    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
102472de9068SAndreas Gohr    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
102572de9068SAndreas Gohr    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
102672de9068SAndreas Gohr    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
102772de9068SAndreas Gohr    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
102872de9068SAndreas Gohr    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
102972de9068SAndreas Gohr    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
103072de9068SAndreas Gohr    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
103172de9068SAndreas Gohr    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
103272de9068SAndreas Gohr    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
103372de9068SAndreas Gohr    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
103472de9068SAndreas Gohr    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
103572de9068SAndreas Gohr    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
103672de9068SAndreas Gohr    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
103772de9068SAndreas Gohr    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
103872de9068SAndreas Gohr    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
103972de9068SAndreas Gohr    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
104072de9068SAndreas Gohr    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
104172de9068SAndreas Gohr    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
104272de9068SAndreas Gohr    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
104372de9068SAndreas Gohr    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
104472de9068SAndreas Gohr    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
104572de9068SAndreas Gohr    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
104672de9068SAndreas Gohr    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
104772de9068SAndreas Gohr    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
104872de9068SAndreas Gohr    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
104972de9068SAndreas Gohr    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
105072de9068SAndreas Gohr    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
105172de9068SAndreas Gohr    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
105272de9068SAndreas Gohr    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
105372de9068SAndreas Gohr    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
105472de9068SAndreas Gohr    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
105572de9068SAndreas Gohr    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
105672de9068SAndreas Gohr    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
105772de9068SAndreas Gohr    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
105872de9068SAndreas Gohr    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
105972de9068SAndreas Gohr  );
106072de9068SAndreas Gohr}; // end of case lookup tables
1061ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table defines replacements from the ASCII-7 range for
 * accented characters. It contains lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
107154662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS;
107282257610Sandi$UTF8_LOWER_ACCENTS = array(
107382257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
107482257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
107582257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
107682257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
107782257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
107882257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
107982257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
108082257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
108182257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
108282257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
108382257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
108482257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
108582257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
108682257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
108774c0c504Schris  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
108882257610Sandi);
108982257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps upper case accented (and a few other non-ASCII) letters to a plain
 * ASCII-7 replacement. Most entries map to a single capital letter, some map
 * to a capital followed by a lower case letter (for example 'Ä' => 'Ae',
 * 'Þ' => 'Th').
 *
 * This table contains upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
// `global` makes the assignment below target the global scope even when this
// file is included from inside a function
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
111782257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point (e.g. 0x00a0, 0x2013), not a UTF-8 byte
 * sequence.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
// `global` makes the assignment below target the global scope even when this
// file is included from inside a function
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08,
  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // figure space, narrow no-break space, word joiner, zero width no-break space.
  // These were previously written as their UTF-8 byte sequences (0xe28087 etc.),
  // which are not code points; no-break space U+00A0 is already listed above.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1199340756e4Sandi
// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS data above
1201720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1202720307d9Schris$UTF8_SPECIAL_CHARS2 =
120337242afaSTom N Harris    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1204720307d9Schris    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1205fae4b5fcSAndreas Gohr    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1206720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1207720307d9Schris    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1208720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1209720307d9Schris    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1210720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1211720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1212720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1213720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1214720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1215720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1216720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1217d5b23302STom N Harris    '➷➸➹➺➻➼➽➾'.
1218d5b23302STom N Harris    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1219d5b23302STom N Harris    '�'.
1220d5b23302STom N Harris    '�ﹼﹽ'.
1221d5b23302STom N Harris    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1222fae4b5fcSAndreas Gohr    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1223*7de9cff5SAndreas Gohr    '����������������������������������������������������������������'.
1224*7de9cff5SAndreas Gohr    '   ⁠';
1225720307d9Schris
12268a831f2bSAndreas Gohr/**
12278a831f2bSAndreas Gohr * Romanization lookup table
12288a831f2bSAndreas Gohr *
12298a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language
12308a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII.
12318a831f2bSAndreas Gohr *
12328a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
12338a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement
12348a831f2bSAndreas Gohr * only. Specialities of each language are not supported.
12358a831f2bSAndreas Gohr *
12368a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
12378a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
12388a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
12398a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
12408a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
12418a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
12428a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
12438a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
1244014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com>
1245fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de>
12468a831f2bSAndreas Gohr */
124754662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
12488a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
1249176ae32bSAndreas Gohr  // scandinavian - differs from what we do in deaccent
1250176ae32bSAndreas Gohr  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1251176ae32bSAndreas Gohr
12528a831f2bSAndreas Gohr  //russian cyrillic
12538a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
12548a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
12558a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
12568a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
12578a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
12588a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1259d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1260f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
12618a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
12628a831f2bSAndreas Gohr  // Ukrainian cyrillic
12638a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
12648a831f2bSAndreas Gohr  // Georgian
12658a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
12668a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
12678a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
12688a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
12698a831f2bSAndreas Gohr  'ჰ'=>'xh',
12708a831f2bSAndreas Gohr  //Sanskrit
12718a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
12728a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
12738a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
12748a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
12758a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
12768a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
12778a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
12788a831f2bSAndreas Gohr  //Hebrew
12793dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
12803dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
12813dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
12828a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
12838a831f2bSAndreas Gohr  //Arabic
12848a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
12858a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
12868a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
12878a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
12888a831f2bSAndreas Gohr
1289799e0977SAndreas Gohr  // Japanese characters  (last update: 2008-05-09)
12909476a253SAndreas Gohr
12918a831f2bSAndreas Gohr  // Japanese hiragana
1292fed467f8SDenis Scheither
1293fed467f8SDenis Scheither  // 3 character syllables, っ doubles the consonant after
1294fed467f8SDenis Scheither  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1295879205e1SAndreas Gohr  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1296799e0977SAndreas Gohr  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1297879205e1SAndreas Gohr  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1298879205e1SAndreas Gohr  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1299879205e1SAndreas Gohr  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1300879205e1SAndreas Gohr  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1301879205e1SAndreas Gohr  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1302879205e1SAndreas Gohr  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1303879205e1SAndreas Gohr  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1304879205e1SAndreas Gohr  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1305879205e1SAndreas Gohr
1306879205e1SAndreas Gohr  // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1307879205e1SAndreas Gohr  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1308879205e1SAndreas Gohr  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1309fed467f8SDenis Scheither
1310fed467f8SDenis Scheither   // 2 character syllables - normal
1311879205e1SAndreas Gohr  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1312fed467f8SDenis Scheither  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1313fed467f8SDenis Scheither  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1314799e0977SAndreas Gohr  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1315799e0977SAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1316fed467f8SDenis Scheither  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1317fed467f8SDenis Scheither  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1318fed467f8SDenis Scheither  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1319fed467f8SDenis Scheither  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1320fed467f8SDenis Scheither  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1321fed467f8SDenis Scheither  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1322879205e1SAndreas Gohr  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1323879205e1SAndreas Gohr  'うぇ'=>'we','うぃ'=>'wi',
1324879205e1SAndreas Gohr  'いぇ'=>'ye',
1325fed467f8SDenis Scheither
1326fed467f8SDenis Scheither  // 2 character syllables, っ doubles the consonant after
1327fed467f8SDenis Scheither  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1328fed467f8SDenis Scheither  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1329fed467f8SDenis Scheither  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1330fed467f8SDenis Scheither  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1331fed467f8SDenis Scheither  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1332fed467f8SDenis Scheither  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1333fed467f8SDenis Scheither  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1334fed467f8SDenis Scheither  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1335fed467f8SDenis Scheither  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1336fed467f8SDenis Scheither  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1337799e0977SAndreas Gohr  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1338fed467f8SDenis Scheither
1339fed467f8SDenis Scheither  // 1 character syllabels
1340fed467f8SDenis Scheither  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1341879205e1SAndreas Gohr  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1342fed467f8SDenis Scheither  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1343fed467f8SDenis Scheither  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
13449476a253SAndreas Gohr  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1345fed467f8SDenis Scheither  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1346fed467f8SDenis Scheither  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1347fed467f8SDenis Scheither  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1348fed467f8SDenis Scheither  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1349fed467f8SDenis Scheither  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1350fed467f8SDenis Scheither  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1351fed467f8SDenis Scheither  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1352879205e1SAndreas Gohr  'わ'=>'wa','を'=>'wo',
1353879205e1SAndreas Gohr  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1354879205e1SAndreas Gohr  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
13559476a253SAndreas Gohr  // old characters
13569476a253SAndreas Gohr  'ゑ'=>'we','ゐ'=>'wi',
1357fed467f8SDenis Scheither
13589476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
13599476a253SAndreas Gohr  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
13609476a253SAndreas Gohr  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1361fed467f8SDenis Scheither
13629476a253SAndreas Gohr  // never seen one of those (disabled for the moment)
1363879205e1SAndreas Gohr  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
13649476a253SAndreas Gohr  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
13659476a253SAndreas Gohr  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
13669476a253SAndreas Gohr  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
13679476a253SAndreas Gohr  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
13689476a253SAndreas Gohr  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
13699476a253SAndreas Gohr  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
13709476a253SAndreas Gohr  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
13719476a253SAndreas Gohr  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
13729476a253SAndreas Gohr  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
13739476a253SAndreas Gohr  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
13749476a253SAndreas Gohr  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
13759476a253SAndreas Gohr  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
13769476a253SAndreas Gohr  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1377fed467f8SDenis Scheither
1378fed467f8SDenis Scheither  // 'spare' characters from other romanization systems
1379fed467f8SDenis Scheither  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1380fed467f8SDenis Scheither  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1381fed467f8SDenis Scheither  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1382fed467f8SDenis Scheither  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1383fed467f8SDenis Scheither  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1384fed467f8SDenis Scheither  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1385fed467f8SDenis Scheither  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1386fed467f8SDenis Scheither  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1387fed467f8SDenis Scheither  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1388fed467f8SDenis Scheither  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1389fed467f8SDenis Scheither
1390fed467f8SDenis Scheither
13918a831f2bSAndreas Gohr  // Japanese katakana
1392fed467f8SDenis Scheither
1393fed467f8SDenis Scheither  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs)
1394fed467f8SDenis Scheither  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1395fed467f8SDenis Scheither  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1396fed467f8SDenis Scheither  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1397fed467f8SDenis Scheither  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1398fed467f8SDenis Scheither  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1399fed467f8SDenis Scheither  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1400fed467f8SDenis Scheither  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1401fed467f8SDenis Scheither  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1402fed467f8SDenis Scheither  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1403799e0977SAndreas Gohr  'ッティー'=>'ttii',
1404799e0977SAndreas Gohr  'ッヂィー'=>'ddii',
1405fed467f8SDenis Scheither
1406fed467f8SDenis Scheither  // 3 character syllables - doubled vowels
1407fed467f8SDenis Scheither  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1408fed467f8SDenis Scheither  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1409fed467f8SDenis Scheither  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1410fed467f8SDenis Scheither  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1411fed467f8SDenis Scheither  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1412fed467f8SDenis Scheither  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1413fed467f8SDenis Scheither  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1414fed467f8SDenis Scheither  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1415fed467f8SDenis Scheither  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1416fed467f8SDenis Scheither  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1417fed467f8SDenis Scheither  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1418fed467f8SDenis Scheither  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1419fed467f8SDenis Scheither  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1420fed467f8SDenis Scheither  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1421fed467f8SDenis Scheither  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1422fed467f8SDenis Scheither  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1423fed467f8SDenis Scheither  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1424fed467f8SDenis Scheither  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1425fed467f8SDenis Scheither  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1426fed467f8SDenis Scheither  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1427fed467f8SDenis Scheither  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1428fed467f8SDenis Scheither  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1429fed467f8SDenis Scheither  'ウェー'=>'wee','ウィー'=>'wii',
1430fed467f8SDenis Scheither  'イェー'=>'yee',
1431799e0977SAndreas Gohr  'ティー'=>'tii',
1432799e0977SAndreas Gohr  'ヂィー'=>'dii',
1433fed467f8SDenis Scheither
1434fed467f8SDenis Scheither  // 3 character syllables - doubled consonants
1435fed467f8SDenis Scheither  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1436fed467f8SDenis Scheither  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1437fed467f8SDenis Scheither  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1438fed467f8SDenis Scheither  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1439fed467f8SDenis Scheither  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1440fed467f8SDenis Scheither  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1441fed467f8SDenis Scheither  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1442fed467f8SDenis Scheither  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1443fed467f8SDenis Scheither  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1444799e0977SAndreas Gohr  'ッティ'=>'tti',
1445799e0977SAndreas Gohr  'ッヂィ'=>'ddi',
1446fed467f8SDenis Scheither
1447fed467f8SDenis Scheither  // 3 character syllables - doubled vowel and consonants
1448fed467f8SDenis Scheither  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1449fed467f8SDenis Scheither  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1450fed467f8SDenis Scheither  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1451fed467f8SDenis Scheither  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1452fed467f8SDenis Scheither  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1453fed467f8SDenis Scheither  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1454fed467f8SDenis Scheither  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1455fed467f8SDenis Scheither  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1456799e0977SAndreas Gohr  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1457799e0977SAndreas Gohr  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1458fed467f8SDenis Scheither  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1459fed467f8SDenis Scheither
1460fed467f8SDenis Scheither  // 2 character syllables - normal
1461799e0977SAndreas Gohr  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1462799e0977SAndreas Gohr  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1463799e0977SAndreas Gohr  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1464fed467f8SDenis Scheither  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1465fed467f8SDenis Scheither  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1466fed467f8SDenis Scheither  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1467fed467f8SDenis Scheither  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1468fed467f8SDenis Scheither  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1469fed467f8SDenis Scheither  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1470fed467f8SDenis Scheither  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1471fed467f8SDenis Scheither  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1472879205e1SAndreas Gohr  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1473879205e1SAndreas Gohr  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1474fed467f8SDenis Scheither  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1475fed467f8SDenis Scheither  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1476fed467f8SDenis Scheither  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1477799e0977SAndreas Gohr  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1478fed467f8SDenis Scheither  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1479fed467f8SDenis Scheither  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1480fed467f8SDenis Scheither  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1481fed467f8SDenis Scheither  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1482fed467f8SDenis Scheither  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1483fed467f8SDenis Scheither  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1484fed467f8SDenis Scheither  'ウェ'=>'we','ウィ'=>'wi',
1485fed467f8SDenis Scheither  'イェ'=>'ye',
1486799e0977SAndreas Gohr  'ティ'=>'ti',
1487799e0977SAndreas Gohr  'ヂィ'=>'di',
1488fed467f8SDenis Scheither
1489fed467f8SDenis Scheither  // 2 character syllables - doubled vowel
1490fed467f8SDenis Scheither  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1491fed467f8SDenis Scheither  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1492fed467f8SDenis Scheither  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1493fed467f8SDenis Scheither  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1494fed467f8SDenis Scheither  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1495fed467f8SDenis Scheither  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1496fed467f8SDenis Scheither  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1497fed467f8SDenis Scheither  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1498fed467f8SDenis Scheither  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1499fed467f8SDenis Scheither  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1500fed467f8SDenis Scheither  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1501799e0977SAndreas Gohr  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1502fed467f8SDenis Scheither  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1503fed467f8SDenis Scheither  'ワー'=>'waa','ヲー'=>'woo',
1504fed467f8SDenis Scheither  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1505fed467f8SDenis Scheither  'ヵー'=>'kaa','ヶー'=>'kee',
15069476a253SAndreas Gohr  // old characters
15079476a253SAndreas Gohr  'ヱー'=>'wee','ヰー'=>'wii',
1508fed467f8SDenis Scheither
1509879205e1SAndreas Gohr  // separate katakana 'n'
1510879205e1SAndreas Gohr  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1511879205e1SAndreas Gohr  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1512879205e1SAndreas Gohr
1513fed467f8SDenis Scheither  // 2 character syllables - doubled consonants
1514fed467f8SDenis Scheither  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1515fed467f8SDenis Scheither  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1516fed467f8SDenis Scheither  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1517fed467f8SDenis Scheither  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1518fed467f8SDenis Scheither  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1519fed467f8SDenis Scheither  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1520fed467f8SDenis Scheither  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1521fed467f8SDenis Scheither  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1522799e0977SAndreas Gohr  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1523799e0977SAndreas Gohr  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1524fed467f8SDenis Scheither  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1525fed467f8SDenis Scheither
1526fed467f8SDenis Scheither  // 1 character syllables
1527fed467f8SDenis Scheither  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1528fed467f8SDenis Scheither  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1529fed467f8SDenis Scheither  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1530fed467f8SDenis Scheither  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1531fed467f8SDenis Scheither  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1532fed467f8SDenis Scheither  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1533fed467f8SDenis Scheither  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1534fed467f8SDenis Scheither  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1535fed467f8SDenis Scheither  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1536fed467f8SDenis Scheither  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1537879205e1SAndreas Gohr  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1538fed467f8SDenis Scheither  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1539fed467f8SDenis Scheither  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1540fed467f8SDenis Scheither  'ワ'=>'wa','ヲ'=>'wo',
1541fed467f8SDenis Scheither  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1542fed467f8SDenis Scheither  'ヵ'=>'ka','ヶ'=>'ke',
15439476a253SAndreas Gohr  // old characters
15449476a253SAndreas Gohr  'ヱ'=>'we','ヰ'=>'wi',
1545fed467f8SDenis Scheither
15469476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1547fed467f8SDenis Scheither  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1548fed467f8SDenis Scheither  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1549fed467f8SDenis Scheither
1550799e0977SAndreas Gohr  // special characters
1551799e0977SAndreas Gohr  '・'=>'_','、'=>'_',
1552799e0977SAndreas Gohr  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1553799e0977SAndreas Gohr
1554fed467f8SDenis Scheither  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1555fed467f8SDenis Scheither  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1556fed467f8SDenis Scheither  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1557fed467f8SDenis Scheither  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1558fed467f8SDenis Scheither  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1559fed467f8SDenis Scheither  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1560fed467f8SDenis Scheither  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1561fed467f8SDenis Scheither  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1562fed467f8SDenis Scheither  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1563fed467f8SDenis Scheither  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1564fed467f8SDenis Scheither  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1565fed467f8SDenis Scheither  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
15668a831f2bSAndreas Gohr
15678a831f2bSAndreas Gohr  // "Greeklish"
15688a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
15698a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
15708a831f2bSAndreas Gohr
15718a831f2bSAndreas Gohr  // Thai
15728a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
15738a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
15748a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
15758a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
15768a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
15778a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1578014d0ab6SAndreas Gohr  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1579014d0ab6SAndreas Gohr  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1580014d0ab6SAndreas Gohr  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1581014d0ab6SAndreas Gohr  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1582014d0ab6SAndreas Gohr  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1583014d0ab6SAndreas Gohr  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1584014d0ab6SAndreas Gohr  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1585014d0ab6SAndreas Gohr  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1586014d0ab6SAndreas Gohr  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1587014d0ab6SAndreas Gohr  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1588014d0ab6SAndreas Gohr	'๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1589014d0ab6SAndreas Gohr  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
15908a831f2bSAndreas Gohr
15918a831f2bSAndreas Gohr  // Korean
15928a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
15938a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
15948a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
15958a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
15968a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
15978a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
15988a831f2bSAndreas Gohr);
1599340756e4Sandi
1600340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
16018a831f2bSAndreas Gohr
1602