xref: /dokuwiki/inc/utf8.php (revision 72de906899b8636e7c60c79f81509d00d9b7bd2b)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide whether the mbstring extension should be used
 *
 * UTF8_MBSTRING is left alone when the including script has already defined
 * it. Otherwise it becomes 1 when mb_substr() exists and UTF8_NOMBSTRING is
 * not defined, and 0 in every other case.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// when mbstring is in use, make UTF-8 its internal encoding
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
215e613a5cSchris
22ab77016bSAndreas Gohr
/**
 * URL-encode a filename so it may contain Unicode characters
 *
 * Slashes are left unencoded.
 *
 * In safe mode (the default) a name consisting only of ASCII letters,
 * digits and the characters / _ - . % is returned untouched. This makes
 * it safe to run the function several times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file name to encode
 * @param  boolean $safe return names that already look encoded unchanged
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier keeps $ from matching in front of a trailing newline,
  // so a name such as "abc\n" is not mistaken for an already safe one.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }

  // encode everything, then restore the path separators
  return str_replace('%2F','/',urlencode($file));
}
4449c713a3Sandi
/**
 * URL-decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded name
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5749c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return boolean true when no byte above 127 occurs (also for '')
 */
function utf8_isASCII($str){
  // byte-wise match (no /u modifier): any byte in 0x80-0xFF is non-ASCII
  return !preg_match('/[\x80-\xFF]/', (string)$str);
}
6944f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 127 is dropped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  // byte-wise replace (no /u modifier), so malformed UTF-8 is handled too
  return preg_replace('/[\x80-\xFF]/', '', (string)$str);
}
86e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks over the bytes and checks that every lead byte is followed by the
 * number of 10bbbbbb continuation bytes it announces. Lead bytes for
 * sequences of up to six bytes are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return boolean false on the first byte that does not fit the scheme
 */
function utf8_check($Str) {
 $total = strlen($Str);
 $pos   = 0;

 while ($pos < $total) {
  $lead = ord($Str[$pos]);
  $pos++;

  if ($lead < 0x80) continue;               # 0bbbbbbb

  if     (($lead & 0xE0) == 0xC0) $trail=1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $trail=2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $trail=3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $trail=4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $trail=5; # 1111110b
  else return false;                        # Does not match any model

  # the announced number of continuation bytes has to follow
  while ($trail-- > 0) {
   if ($pos == $total) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
   $pos++;
  }
 }
 return true;
}
11049c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() turns every character outside ISO-8859-1 into a single
 * '?'. For counting that is good enough, so the byte length of the decoded
 * string is returned as the character count. The original author notes
 * this as being even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string
 * @return integer
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
1252f954959Sandi
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * With UTF8_MBSTRING set the work is handed to mb_substr(); otherwise a
 * PCRE pattern with the 'u' modifier is built that skips $offset characters
 * and captures the wanted part.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or false if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        return ($length === null)
            ? mb_substr($str, $offset)
            : mb_substr($str, $offset, $length);
    }

    /*
     * No mbstring from here on.
     *
     * - A single PCRE repetition has to stay below 65536, so larger counts
     *   are expressed as repeated groups of 65535 characters plus a rest.
     * - Like mb_substr, and unlike substr, an empty string is returned where
     *   substr might return false (e.g. offset beyond the end).
     * - Counting the characters is comparatively expensive, so it is only
     *   done when needed: for negative offsets or when a length is given.
     */

    // normalise the parameter types once
    $str        = (string)$str;
    $offset     = (int)$offset;
    $open_ended = is_null($length);
    if (!$open_ended) $length = (int)$length;

    // trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    // character count, filled in lazily
    $chars = null;

    // turn a negative offset into the equivalent one counted from the left
    if ($offset < 0) {
        $chars  = strlen(utf8_decode($str));
        $offset = max(0, $chars + $offset);
    }

    // pattern part that consumes the first $offset characters uncaptured
    if ($offset > 0) {
        $blocks = (int)($offset/65535);
        $rest   = $offset%65535;
        $skip   = '^(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';
    } else {
        $skip = '^';                              // nothing to skip, just anchor
    }

    // pattern part that captures the result
    if ($open_ended) {
        $take = '(.*)$';                          // everything up to the end
    } else {
        if ($chars === null) $chars = strlen(utf8_decode($str));
        if ($offset > $chars) return '';          // offset beyond the end

        $take = '';
        if ($length > 0) {
            // never ask for more characters than are left
            $length = min($chars-$offset, $length);

            $blocks = (int)($length/65535);
            $rest   = $length%65535;

            // capture exactly $length characters
            $take = '('.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';

        } elseif ($length < 0) {
            if ($length < ($offset - $chars)) return '';

            $blocks = (int)((-$length)/65535);
            $rest   = (-$length)%65535;

            // capture everything but the last -$length characters
            $take = '(.*)(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})$';
        }
    }

    if (!preg_match('#'.$skip.$take.'#us',$str,$match)) return '';
    return $match[1];
}
22710f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * Replaces $length characters of $string, starting at character $start,
 * with $replacement. Note that $length defaults to 0 here, i.e. the
 * replacement is inserted at $start.
 *
 * Negative values follow substr_replace(): a negative $start counts from
 * the end of the string, a negative $length gives the number of characters
 * at the end of the string that are kept.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string
 * @param  string  $replacement
 * @param  integer $start  character offset
 * @param  integer $length number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $start  = (int)$start;
  $length = (int)$length;

  // Resolve negative values against the character count first. Without
  // this a negative $start lost the head of the string and a negative
  // $length duplicated text.
  if($start < 0 || $length < 0){
    $total = utf8_strlen($string);
    if($start < 0)  $start  = max(0, $total + $start);
    if($length < 0) $length = max(0, $total - $start + $length);
  }

  $ret = '';
  if($start>0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
241dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a PCRE pattern with the 'u'
 * modifier. An empty separator raises an E_USER_WARNING and yields false.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator
 * @param  string $str string to split
 * @return mixed  array of parts, false for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
257f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as PCRE patterns with the 'u'
 * modifier. Replacements are inserted literally.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   search string or array of search strings
 * @param  mixed  $r   replacement string or array of replacement strings
 * @param  mixed  $str subject string or array of subject strings
 * @return mixed  whatever preg_replace() returns for the subject
 */
function utf8_str_replace($s,$r,$str){
  // turn the literal search strings into patterns
  if(is_array($s)){
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }else{
    $s = '!'.preg_quote($s,'!').'!u';
  }

  // preg_replace() treats $n, ${n} and \n in a replacement as references
  // to captured groups; escape \ and $ so replacements stay literal
  $literal = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr($v,$literal);
    }
  }else{
    $r = strtr($r,$literal);
  }

  return preg_replace($s,$r,$str);
}
275f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. Otherwise every character
 * of the charlist is stripped from the start of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip, taken literally
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote charlist for use in a character class; ^ has to be quoted as
  // well, a leading one would negate (or break) the class otherwise
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
291f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. Otherwise every character
 * of the charlist is stripped from the end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip, taken literally
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote charlist for use in a character class; ^ has to be quoted as
  // well, a leading one would negate (or break) the class otherwise
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  // D modifier: $ must only match at the very end, not in front of a
  // trailing newline
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
307f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used. Otherwise the charlist is
 * stripped from both ends via utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed on, otherwise only whitespace is trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
320f29317c1Sandi
3212f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension when UTF8_MBSTRING is set, otherwise the
 * global $UTF8_UPPER_TO_LOWER translation table is applied with strtr().
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(!UTF8_MBSTRING){
    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
  }

  return mb_strtolower($string,'utf-8');
}
33782257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension when UTF8_MBSTRING is set, otherwise the
 * global $UTF8_LOWER_TO_UPPER translation table is applied with strtr().
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(!UTF8_MBSTRING){
    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
  }

  return mb_strtoupper($string,'utf-8');
}
35382257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper
 * ($case = 1) letters. Default is to deaccent both cases ($case = 0).
 * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string
 * @param  integer $case -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $do_lower = ($case <= 0);
  $do_upper = ($case >= 0);

  if($do_lower){
    global $UTF8_LOWER_ACCENTS;
    $string = strtr($string,$UTF8_LOWER_ACCENTS);
  }

  if($do_upper){
    global $UTF8_UPPER_ACCENTS;
    $string = strtr($string,$UTF8_UPPER_ACCENTS);
  }

  return $string;
}
37382257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as is. Anything else is run through the
 * global $UTF8_ROMANIZATION translation table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  return $string; //nothing to do
}
3858a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2.
 * In addition the bytes 0x00 to 0x19 and anything given in $additional
 * are stripped. The quoted form of $UTF8_SPECIAL_CHARS2 is computed on the
 * first call and reused afterwards.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;   // declared, but not read in this function
  global $UTF8_SPECIAL_CHARS2;

  static $quoted = null;
  if($quoted === null){
    $quoted = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  // $additional goes into the character class unquoted
  $class = '['.$additional.'\x00-\x19'.$quoted.']';

  return preg_replace('/'.$class.'/u',$repl,$string);
}
409099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * $offset and the returned position are counted in characters. The search
 * itself is done with the byte based strpos(); a hit that lies in front of
 * the requested character offset makes the search start over further into
 * the string.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string
 * @param  string
 * @param  integer
 * @return integer
 */
function utf8_strpos($haystack, $needle, $offset=0){
    $drift = 0;   // bytes by which the byte position ran ahead of the character position

    do {
        $bytepos = strpos($haystack, $needle, $offset + $drift);
        if ($bytepos === false) {
            return false;
        }

        // character position of the hit
        $charpos = utf8_strlen(substr($haystack, 0, $bytepos));

        // hit in front of the wanted offset: search again, further right
        $too_early = ($charpos < $offset);
        if ($too_early) {
            $drift = $bytepos - $charpos;
        }
    } while ($too_early);

    return $charpos;
}
438f29317c1Sandi
4392f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Code points below 0x80 are passed through as characters, those below
 * 0x100 become decimal entities and everything else hexadecimal entities.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
    $pieces = array();
    foreach (utf8_to_unicode($str) as $codepoint) {
        if ($codepoint >= 0x100) {
            $pieces[] = '&#x'.dechex($codepoint).';';
        } elseif ($codepoint >= 0x80) {
            $pieces[] = '&#'.$codepoint.';';
        } else {
            $pieces[] = chr($codepoint);
        }
    }
    return implode('', $pieces);
}
4599f9fb0e5STom N Harris
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 * are handled as well. Avoids the problem that would occur if you
 * had to decode "&amp;#38;&#38;amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 * what it should be                   -> "&#38;&amp#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // one decoder instance is shared by all calls
    static $decoder = null;
    if ($decoder === null) {
        $decoder = new utf8_entity_decoder();
    }

    if (!is_null($entities)) {
        // numeric and named entities, resolved by the decoder object
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
    }

    // numeric entities only
    return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                 'utf8_decode_numeric', $str);
}
/**
 * Callback turning a matched numeric entity into its UTF-8 character
 *
 * Expects a match array as produced by the patterns in utf8_unhtml():
 * index 0 holds the whole entity, index 2 an optional 'x'/'X' marking a
 * hexadecimal entity and index 3 the digits.
 *
 * Entities whose digits are not a valid number (e.g. "&#foo;" or "&#xyz;")
 * are returned unchanged instead of being decoded to code point 0.
 *
 * @param  array  $ent match array
 * @return string UTF-8 character, or the untouched entity when malformed
 */
function utf8_decode_numeric($ent) {
    $digits = $ent[3];

    if ($ent[2] === 'x' || $ent[2] === 'X') {
        if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) return $ent[0];
        $cp = hexdec($digits);
    } else {
        if (!preg_match('/^[0-9]+$/D', $digits)) return $ent[0];
        $cp = intval($digits);
    }

    return unicode_to_utf8(array($cp));
}
/**
 * Callback object for utf8_unhtml() that decodes named and numeric entities
 *
 * Holds a lookup table mapping entities such as "&amp;" to the UTF-8
 * character they stand for.
 */
class utf8_entity_decoder {
    /** lookup table: entity => UTF-8 character */
    var $table;

    /**
     * Build the lookup table
     *
     * The translation table is requested in UTF-8 explicitly, so its
     * characters can be used as they are. Flipping it gives the
     * entity => character direction needed for decoding.
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
        $this->table = array_flip($table);
    }

    /**
     * Old style constructor, kept so existing explicit calls keep working.
     * A method named like the class is no longer run by "new" on PHP 8.
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert a single byte character to UTF-8 via its ordinal value
     *
     * No longer used for building the table; kept for callers that may
     * still rely on it.
     */
    function makeutf8($c) {
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * preg_replace_callback() handler
     *
     * @param  array  $ent match of '/&(#)?([Xx])?([0-9A-Za-z]+);/m'
     * @return string decoded character; unknown named entities are
     *                returned unchanged
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        }
        if (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        }
        return $ent[0];
    }
}
521ea2eed85Sandi
522ea2eed85Sandi/**
5231abfaba4SAndreas Gohr * Takes an UTF-8 string and returns an array of ints representing the
5241abfaba4SAndreas Gohr * Unicode characters. Astral planes are supported ie. the ints in the
5251abfaba4SAndreas Gohr * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
5261abfaba4SAndreas Gohr * are not allowed.
52782257610Sandi *
5281abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input
5291abfaba4SAndreas Gohr * string isn't a valid UTF-8 octet sequence and raises a PHP error at
5301abfaba4SAndreas Gohr * level E_USER_WARNING
5311abfaba4SAndreas Gohr *
5321abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to
5331abfaba4SAndreas Gohr * trigger errors on encountering bad bytes
5341abfaba4SAndreas Gohr *
5351abfaba4SAndreas Gohr * @author <hsivonen@iki.fi>
5361abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com>
5371abfaba4SAndreas Gohr * @param  string  UTF-8 encoded string
5381abfaba4SAndreas Gohr * @param  boolean Check for invalid sequences?
53944881bd0Shenning.noren * @return mixed array of unicode code points or false if UTF-8 invalid
5401abfaba4SAndreas Gohr * @see    unicode_to_utf8
5411abfaba4SAndreas Gohr * @link   http://hsivonen.iki.fi/php-utf8/
5421abfaba4SAndreas Gohr * @link   http://sourceforge.net/projects/phputf8/
54382257610Sandi */
5441abfaba4SAndreas Gohrfunction utf8_to_unicode($str,$strict=false) {
5451abfaba4SAndreas Gohr    $mState = 0;     // cached expected number of octets after the current octet
5461abfaba4SAndreas Gohr                     // until the beginning of the next UTF8 character sequence
5471abfaba4SAndreas Gohr    $mUcs4  = 0;     // cached Unicode character
5481abfaba4SAndreas Gohr    $mBytes = 1;     // cached expected number of octets in the current sequence
54982257610Sandi
5501abfaba4SAndreas Gohr    $out = array();
5511abfaba4SAndreas Gohr
5521abfaba4SAndreas Gohr    $len = strlen($str);
5531abfaba4SAndreas Gohr
5541abfaba4SAndreas Gohr    for($i = 0; $i < $len; $i++) {
5551abfaba4SAndreas Gohr
5561abfaba4SAndreas Gohr        $in = ord($str{$i});
5571abfaba4SAndreas Gohr
5581abfaba4SAndreas Gohr        if ( $mState == 0) {
5591abfaba4SAndreas Gohr
5601abfaba4SAndreas Gohr            // When mState is zero we expect either a US-ASCII character or a
5611abfaba4SAndreas Gohr            // multi-octet sequence.
5621abfaba4SAndreas Gohr            if (0 == (0x80 & ($in))) {
5631abfaba4SAndreas Gohr                // US-ASCII, pass straight through.
5641abfaba4SAndreas Gohr                $out[] = $in;
5651abfaba4SAndreas Gohr                $mBytes = 1;
5661abfaba4SAndreas Gohr
5671abfaba4SAndreas Gohr            } else if (0xC0 == (0xE0 & ($in))) {
5681abfaba4SAndreas Gohr                // First octet of 2 octet sequence
5691abfaba4SAndreas Gohr                $mUcs4 = ($in);
5701abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x1F) << 6;
5711abfaba4SAndreas Gohr                $mState = 1;
5721abfaba4SAndreas Gohr                $mBytes = 2;
5731abfaba4SAndreas Gohr
5741abfaba4SAndreas Gohr            } else if (0xE0 == (0xF0 & ($in))) {
5751abfaba4SAndreas Gohr                // First octet of 3 octet sequence
5761abfaba4SAndreas Gohr                $mUcs4 = ($in);
5771abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x0F) << 12;
5781abfaba4SAndreas Gohr                $mState = 2;
5791abfaba4SAndreas Gohr                $mBytes = 3;
5801abfaba4SAndreas Gohr
5811abfaba4SAndreas Gohr            } else if (0xF0 == (0xF8 & ($in))) {
5821abfaba4SAndreas Gohr                // First octet of 4 octet sequence
5831abfaba4SAndreas Gohr                $mUcs4 = ($in);
5841abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x07) << 18;
5851abfaba4SAndreas Gohr                $mState = 3;
5861abfaba4SAndreas Gohr                $mBytes = 4;
5871abfaba4SAndreas Gohr
5881abfaba4SAndreas Gohr            } else if (0xF8 == (0xFC & ($in))) {
5891abfaba4SAndreas Gohr                /* First octet of 5 octet sequence.
5901abfaba4SAndreas Gohr                 *
5911abfaba4SAndreas Gohr                 * This is illegal because the encoded codepoint must be either
5921abfaba4SAndreas Gohr                 * (a) not the shortest form or
5931abfaba4SAndreas Gohr                 * (b) outside the Unicode range of 0-0x10FFFF.
5941abfaba4SAndreas Gohr                 * Rather than trying to resynchronize, we will carry on until the end
5951abfaba4SAndreas Gohr                 * of the sequence and let the later error handling code catch it.
5961abfaba4SAndreas Gohr                 */
5971abfaba4SAndreas Gohr                $mUcs4 = ($in);
5981abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 0x03) << 24;
5991abfaba4SAndreas Gohr                $mState = 4;
6001abfaba4SAndreas Gohr                $mBytes = 5;
6011abfaba4SAndreas Gohr
6021abfaba4SAndreas Gohr            } else if (0xFC == (0xFE & ($in))) {
6031abfaba4SAndreas Gohr                // First octet of 6 octet sequence, see comments for 5 octet sequence.
6041abfaba4SAndreas Gohr                $mUcs4 = ($in);
6051abfaba4SAndreas Gohr                $mUcs4 = ($mUcs4 & 1) << 30;
6061abfaba4SAndreas Gohr                $mState = 5;
6071abfaba4SAndreas Gohr                $mBytes = 6;
6081abfaba4SAndreas Gohr
6091abfaba4SAndreas Gohr            } elseif($strict) {
6101abfaba4SAndreas Gohr                /* Current octet is neither in the US-ASCII range nor a legal first
6111abfaba4SAndreas Gohr                 * octet of a multi-octet sequence.
6121abfaba4SAndreas Gohr                 */
6131abfaba4SAndreas Gohr                trigger_error(
6141abfaba4SAndreas Gohr                        'utf8_to_unicode: Illegal sequence identifier '.
6151abfaba4SAndreas Gohr                            'in UTF-8 at byte '.$i,
6161abfaba4SAndreas Gohr                        E_USER_WARNING
6171abfaba4SAndreas Gohr                    );
61844881bd0Shenning.noren                return false;
6191abfaba4SAndreas Gohr
6201abfaba4SAndreas Gohr            }
6211abfaba4SAndreas Gohr
6221abfaba4SAndreas Gohr        } else {
6231abfaba4SAndreas Gohr
6241abfaba4SAndreas Gohr            // When mState is non-zero, we expect a continuation of the multi-octet
6251abfaba4SAndreas Gohr            // sequence
6261abfaba4SAndreas Gohr            if (0x80 == (0xC0 & ($in))) {
6271abfaba4SAndreas Gohr
6281abfaba4SAndreas Gohr                // Legal continuation.
6291abfaba4SAndreas Gohr                $shift = ($mState - 1) * 6;
6301abfaba4SAndreas Gohr                $tmp = $in;
6311abfaba4SAndreas Gohr                $tmp = ($tmp & 0x0000003F) << $shift;
6321abfaba4SAndreas Gohr                $mUcs4 |= $tmp;
6331abfaba4SAndreas Gohr
6341abfaba4SAndreas Gohr                /**
6351abfaba4SAndreas Gohr                 * End of the multi-octet sequence. mUcs4 now contains the final
6361abfaba4SAndreas Gohr                 * Unicode codepoint to be output
6371abfaba4SAndreas Gohr                 */
6381abfaba4SAndreas Gohr                if (0 == --$mState) {
6391abfaba4SAndreas Gohr
6401abfaba4SAndreas Gohr                    /*
6411abfaba4SAndreas Gohr                     * Check for illegal sequences and codepoints.
6421abfaba4SAndreas Gohr                     */
6431abfaba4SAndreas Gohr                    // From Unicode 3.1, non-shortest form is illegal
6441abfaba4SAndreas Gohr                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
6451abfaba4SAndreas Gohr                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
6461abfaba4SAndreas Gohr                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
6471abfaba4SAndreas Gohr                        (4 < $mBytes) ||
6481abfaba4SAndreas Gohr                        // From Unicode 3.2, surrogate characters are illegal
6491abfaba4SAndreas Gohr                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
6501abfaba4SAndreas Gohr                        // Codepoints outside the Unicode range are illegal
6511abfaba4SAndreas Gohr                        ($mUcs4 > 0x10FFFF)) {
6521abfaba4SAndreas Gohr
6531abfaba4SAndreas Gohr                        if($strict){
6541abfaba4SAndreas Gohr                            trigger_error(
6551abfaba4SAndreas Gohr                                    'utf8_to_unicode: Illegal sequence or codepoint '.
6561abfaba4SAndreas Gohr                                        'in UTF-8 at byte '.$i,
6571abfaba4SAndreas Gohr                                    E_USER_WARNING
6581abfaba4SAndreas Gohr                                );
6591abfaba4SAndreas Gohr
66044881bd0Shenning.noren                            return false;
6611abfaba4SAndreas Gohr                        }
6621abfaba4SAndreas Gohr
6631abfaba4SAndreas Gohr                    }
6641abfaba4SAndreas Gohr
6651abfaba4SAndreas Gohr                    if (0xFEFF != $mUcs4) {
6661abfaba4SAndreas Gohr                        // BOM is legal but we don't want to output it
6671abfaba4SAndreas Gohr                        $out[] = $mUcs4;
6681abfaba4SAndreas Gohr                    }
6691abfaba4SAndreas Gohr
6701abfaba4SAndreas Gohr                    //initialize UTF8 cache
6711abfaba4SAndreas Gohr                    $mState = 0;
6721abfaba4SAndreas Gohr                    $mUcs4  = 0;
6731abfaba4SAndreas Gohr                    $mBytes = 1;
6741abfaba4SAndreas Gohr                }
6751abfaba4SAndreas Gohr
6761abfaba4SAndreas Gohr            } elseif($strict) {
6771abfaba4SAndreas Gohr                /**
6781abfaba4SAndreas Gohr                 *((0xC0 & (*in) != 0x80) && (mState != 0))
6791abfaba4SAndreas Gohr                 * Incomplete multi-octet sequence.
6801abfaba4SAndreas Gohr                 */
6811abfaba4SAndreas Gohr                trigger_error(
6821abfaba4SAndreas Gohr                        'utf8_to_unicode: Incomplete multi-octet '.
6831abfaba4SAndreas Gohr                        '   sequence in UTF-8 at byte '.$i,
6841abfaba4SAndreas Gohr                        E_USER_WARNING
6851abfaba4SAndreas Gohr                    );
6861abfaba4SAndreas Gohr
68744881bd0Shenning.noren                return false;
68882257610Sandi            }
68982257610Sandi        }
69082257610Sandi    }
6911abfaba4SAndreas Gohr    return $out;
69282257610Sandi}
69382257610Sandi
/**
 * Takes an array of ints representing Unicode code points and returns the
 * corresponding UTF-8 string. Astral planes are supported, i.e. the ints in
 * the input can be > 0xFFFF. Occurrences of the BOM (0xFEFF) are dropped.
 *
 * Surrogates (0xD800-0xDFFF), negative values and values above 0x10FFFF are
 * not valid code points. By default they are skipped silently; if $strict is
 * set to true the function raises a PHP error at level E_USER_WARNING and
 * returns false as soon as one is found.
 *
 * The result is assembled by plain string concatenation. No output buffer is
 * opened, so none can be left dangling when the function returns early.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid code points?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # Outside the Unicode range (negative or above 0x10FFFF).
        # Negative values must be caught first, otherwise they would pass
        # the "<= 0x07ff" test below and be emitted as garbage bytes.
        if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence (everything that is left is <= 0x10ffff)
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));

        }
    }

    return $utf8;
}
78482257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * With mbstring support the conversion is delegated to mb_convert_encoding().
 * Without it the string is decoded with utf8_to_unicode() and every code
 * point is packed as big-endian 16 bit units. Code points above 0xFFFF are
 * written as a surrogate pair, so the fallback produces real UTF-16 rather
 * than truncated UCS-2.
 *
 * @param  string  $str UTF-8 encoded string
 * @param  boolean $bom prepend the UTF-16BE byte order mark (FE FF)?
 * @return string  UTF-16BE encoded string
 * @see    utf16be_to_utf8
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // guard in case the decoder signals failure with a non-array value
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // astral plane: split into high (0xD800..) and low (0xDC00..) surrogate
      $cp  -= 0x10000;
      $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
80015fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is read as big-endian 16 bit units. A high surrogate
 * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) is
 * combined into the single code point above 0xFFFF it stands for. All other
 * units are handed to unicode_to_utf8() unchanged, so BOM handling and the
 * treatment of unpaired surrogates stay with that function.
 *
 * @param  string $str UTF-16BE encoded string
 * @return string UTF-8 encoded string
 * @see    utf8_to_utf16be
 * @see    unicode_to_utf8
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = null; // high surrogate still waiting for its low partner

  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        // complete pair: rebuild the astral code point
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = null;
        continue;
      }
      // no low surrogate followed: pass the lone unit through unchanged
      $uni[] = $high;
      $high  = null;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }

  // input ended right after a high surrogate
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
81015fa0b4fSAndreas Gohr
/**
 * Replace bad bytes with an alternative character
 *
 * Every byte that is not part of a well-formed UTF-8 sequence is replaced by
 * $replace; well-formed sequences are copied through untouched. An ASCII
 * character is recommended as replacement. With the default (empty string)
 * bad bytes are simply removed.
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is scanned once from left to right with an anchored match at a
 * moving offset, and the result is built by concatenation.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '') - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $str    = (string) $str;
    $len    = strlen($str);
    $pos    = 0;
    $result = '';

    // The A modifier anchors the match at $pos, so each iteration consumes
    // exactly the next sequence (or the next bad byte) of the input.
    while ($pos < $len && preg_match('/'.$UTF8_BAD.'/AS', $str, $matches, 0, $pos)) {
        if ( !isset($matches[2])) {
            // group 2 did not take part: a well-formed sequence
            $result .= $matches[0];
        } else {
            // group 2 caught a byte that fits none of the valid forms
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
850ab77016bSAndreas Gohr
/**
 * Move a byte index inside a UTF-8 string onto a character boundary
 *
 * An index that already points at the first byte of a character is returned
 * unchanged. An index pointing at a continuation byte (bit pattern 10xxxxxx)
 * is moved backwards to the start of that character, or forwards to the
 * start of the following one when $next is true. Indexes at or below 0 give
 * 0, indexes at or beyond the end of the string give the string length.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = backwards (start of current character)
 *                           true = forwards (start of next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) {
    return 0;
  }

  $length = strlen($str);
  if ($i >= $length) {
    return $length;
  }

  if ($next) {
    // walk forward until a non-continuation byte or the end of the string
    for (; $i < $length; $i++) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
    return $i;
  }

  // walk backward until a non-continuation byte or the start of the string
  for (; $i; $i--) {
    if ((ord($str[$i]) & 0xC0) != 0x80) break;
  }
  return $i;
}
8795953e889Schris
880ab77016bSAndreas Gohr// only needed if no mb_string available
881ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
  /**
   * UTF-8 case lookup table (lower case to upper case)
   *
   * This lookup table maps each lower case letter (array key) to its
   * corresponding upper case letter (array value) in UTF-8
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
89054662a04SAndreas Gohr  global $UTF8_LOWER_TO_UPPER;
89154662a04SAndreas Gohr  $UTF8_LOWER_TO_UPPER = array(
892*72de9068SAndreas Gohr    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
893*72de9068SAndreas Gohr    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
894*72de9068SAndreas Gohr    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
895*72de9068SAndreas Gohr    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
896*72de9068SAndreas Gohr    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
897*72de9068SAndreas Gohr    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
898*72de9068SAndreas Gohr    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
899*72de9068SAndreas Gohr    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
900*72de9068SAndreas Gohr    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
901*72de9068SAndreas Gohr    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
902*72de9068SAndreas Gohr    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
903*72de9068SAndreas Gohr    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
904*72de9068SAndreas Gohr    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
905*72de9068SAndreas Gohr    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
906*72de9068SAndreas Gohr    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
907*72de9068SAndreas Gohr    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
908*72de9068SAndreas Gohr    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
909*72de9068SAndreas Gohr    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
910*72de9068SAndreas Gohr    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
911*72de9068SAndreas Gohr    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
912*72de9068SAndreas Gohr    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
913*72de9068SAndreas Gohr    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
914*72de9068SAndreas Gohr    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
915*72de9068SAndreas Gohr    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
916*72de9068SAndreas Gohr    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
917*72de9068SAndreas Gohr    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
918*72de9068SAndreas Gohr    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
919*72de9068SAndreas Gohr    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
920*72de9068SAndreas Gohr    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
921*72de9068SAndreas Gohr    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
922*72de9068SAndreas Gohr    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
923*72de9068SAndreas Gohr    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
924*72de9068SAndreas Gohr    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
925*72de9068SAndreas Gohr    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
926*72de9068SAndreas Gohr    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
927*72de9068SAndreas Gohr    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
928*72de9068SAndreas Gohr    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
929*72de9068SAndreas Gohr    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
930*72de9068SAndreas Gohr    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
931*72de9068SAndreas Gohr    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
932*72de9068SAndreas Gohr    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
933*72de9068SAndreas Gohr    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
934*72de9068SAndreas Gohr    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
935*72de9068SAndreas Gohr    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
936*72de9068SAndreas Gohr    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
937*72de9068SAndreas Gohr    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
938*72de9068SAndreas Gohr    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
939*72de9068SAndreas Gohr    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
940*72de9068SAndreas Gohr    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
941*72de9068SAndreas Gohr    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
942*72de9068SAndreas Gohr    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
943*72de9068SAndreas Gohr    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
944*72de9068SAndreas Gohr    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
945*72de9068SAndreas Gohr    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
946*72de9068SAndreas Gohr    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
947*72de9068SAndreas Gohr    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
948*72de9068SAndreas Gohr    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
949*72de9068SAndreas Gohr    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
950*72de9068SAndreas Gohr    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
951*72de9068SAndreas Gohr    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
952*72de9068SAndreas Gohr    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
953*72de9068SAndreas Gohr    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
954*72de9068SAndreas Gohr    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
955*72de9068SAndreas Gohr    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
956*72de9068SAndreas Gohr    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
957*72de9068SAndreas Gohr    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
958*72de9068SAndreas Gohr    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
95982257610Sandi  );
96082257610Sandi
  /**
   * UTF-8 case lookup table (upper case to lower case)
   *
   * This lookup table maps each upper case letter (array key) to its
   * corresponding lower case letter (array value) in UTF-8
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
96954662a04SAndreas Gohr  global $UTF8_UPPER_TO_LOWER;
970*72de9068SAndreas Gohr  $UTF8_UPPER_TO_LOWER = array (
971*72de9068SAndreas Gohr    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
972*72de9068SAndreas Gohr    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
973*72de9068SAndreas Gohr    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
974*72de9068SAndreas Gohr    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
975*72de9068SAndreas Gohr    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
976*72de9068SAndreas Gohr    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
977*72de9068SAndreas Gohr    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
978*72de9068SAndreas Gohr    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
979*72de9068SAndreas Gohr    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
980*72de9068SAndreas Gohr    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
981*72de9068SAndreas Gohr    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
982*72de9068SAndreas Gohr    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
983*72de9068SAndreas Gohr    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
984*72de9068SAndreas Gohr    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
985*72de9068SAndreas Gohr    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
986*72de9068SAndreas Gohr    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
987*72de9068SAndreas Gohr    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
988*72de9068SAndreas Gohr    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
989*72de9068SAndreas Gohr    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
990*72de9068SAndreas Gohr    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
991*72de9068SAndreas Gohr    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
992*72de9068SAndreas Gohr    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
993*72de9068SAndreas Gohr    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
994*72de9068SAndreas Gohr    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
995*72de9068SAndreas Gohr    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
996*72de9068SAndreas Gohr    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
997*72de9068SAndreas Gohr    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
998*72de9068SAndreas Gohr    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
999*72de9068SAndreas Gohr    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1000*72de9068SAndreas Gohr    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1001*72de9068SAndreas Gohr    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1002*72de9068SAndreas Gohr    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1003*72de9068SAndreas Gohr    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1004*72de9068SAndreas Gohr    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1005*72de9068SAndreas Gohr    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1006*72de9068SAndreas Gohr    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1007*72de9068SAndreas Gohr    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1008*72de9068SAndreas Gohr    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1009*72de9068SAndreas Gohr    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1010*72de9068SAndreas Gohr    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1011*72de9068SAndreas Gohr    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1012*72de9068SAndreas Gohr    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1013*72de9068SAndreas Gohr    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1014*72de9068SAndreas Gohr    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1015*72de9068SAndreas Gohr    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1016*72de9068SAndreas Gohr    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1017*72de9068SAndreas Gohr    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1018*72de9068SAndreas Gohr    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1019*72de9068SAndreas Gohr    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1020*72de9068SAndreas Gohr    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1021*72de9068SAndreas Gohr    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1022*72de9068SAndreas Gohr    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1023*72de9068SAndreas Gohr    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1024*72de9068SAndreas Gohr    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1025*72de9068SAndreas Gohr    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1026*72de9068SAndreas Gohr    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1027*72de9068SAndreas Gohr    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1028*72de9068SAndreas Gohr    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1029*72de9068SAndreas Gohr    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1030*72de9068SAndreas Gohr    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1031*72de9068SAndreas Gohr    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1032*72de9068SAndreas Gohr    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1033*72de9068SAndreas Gohr    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1034*72de9068SAndreas Gohr    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1035*72de9068SAndreas Gohr    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1036*72de9068SAndreas Gohr    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1037*72de9068SAndreas Gohr    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1038*72de9068SAndreas Gohr  );
1039*72de9068SAndreas Gohr}; // end of case lookup tables
1040ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented (and a few other non-ASCII) lower case
 * characters to a replacement from the ASCII-7 range. The replacement is one
 * or two lower case letters, e.g. 'é' => 'e', 'ß' => 'ss', 'ö' => 'oe'.
 * It contains lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
// presumably declared global so the table also lands in the global scope
// when this file is included from inside a function - TODO confirm
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);
106882257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented (and a few other non-ASCII) upper case
 * characters to a replacement from the ASCII-7 range. The replacement is one
 * upper case letter, or an upper case letter followed by a lower case one,
 * e.g. 'É' => 'E', 'Ö' => 'Oe', 'Þ' => 'Th'.
 * It contains upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
// presumably declared global so the table also lands in the global scope
// when this file is included from inside a function - TODO confirm
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
109682257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it is not a complete list of
 * non-alphanumeric characters in UTF-8. It is not perfect but should match
 * most cases of special chars.
 *
 * The entries are Unicode code points, each listed exactly once.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
);

/**
 * UTF-8 version of the above data
 *
 * A single string holding the UTF-8 encoding of every code point in
 * $UTF8_SPECIAL_CHARS, in table order. It is generated from the table instead
 * of being kept as a second hand-maintained literal, so the two can not drift
 * apart and the source file does not need to carry raw control, private-use or
 * fullwidth bytes that editors and tools tend to mangle.
 */
global $UTF8_SPECIAL_CHARS2;
$UTF8_SPECIAL_CHARS2 = '';
foreach($UTF8_SPECIAL_CHARS as $utf8_special_cp){
  if($utf8_special_cp < 0x80){
    // 1 byte: plain ASCII
    $UTF8_SPECIAL_CHARS2 .= chr($utf8_special_cp);
  }elseif($utf8_special_cp < 0x800){
    // 2 bytes: 110xxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xC0 | ($utf8_special_cp >> 6)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }elseif($utf8_special_cp < 0x10000){
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    $UTF8_SPECIAL_CHARS2 .= chr(0xE0 | ($utf8_special_cp >> 12)).
                            chr(0x80 | (($utf8_special_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }else{
    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (no such entry today)
    $UTF8_SPECIAL_CHARS2 .= chr(0xF0 | ($utf8_special_cp >> 18)).
                            chr(0x80 | (($utf8_special_cp >> 12) & 0x3F)).
                            chr(0x80 | (($utf8_special_cp >> 6) & 0x3F)).
                            chr(0x80 | ($utf8_special_cp & 0x3F));
  }
}
// do not leak the loop variable into the including scope
unset($utf8_special_cp);
1197720307d9Schris
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Several keys are listed more than once with alternative romanizations
 * (e.g. the hiragana 'し' as 'ci', 'si' and 'shi'). PHP keeps only the last
 * value given for a repeated array key, so the last occurrence is the one
 * that takes effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian ('კ' is k; 'პ' further down the same row is the letter for p)
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish" (delta is d, not e)
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean ('ㅛ' is yo, completing ya/ye/yo/yu; 'ㅚ' is the one that maps to oy)
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1363340756e4Sandi
1364340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
13658a831f2bSAndreas Gohr
1366