xref: /dokuwiki/inc/utf8.php (revision 9476a253b9cef65ab9f56f3c4938cde99d9cf11b)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide once whether the mbstring extension may be used
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and the opt-out constant
 * UTF8_NOMBSTRING has not been defined, otherwise 0. A value that was
 * defined before this point is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// mb_substr() is called without an explicit encoding further down, so the
// internal encoding has to be UTF-8 whenever mbstring is in use
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
215e613a5cSchris
22ab77016bSAndreas Gohr
/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename (may contain slashes)
 * @param  boolean $safe skip encoding when $file only holds URL-safe characters
 * @return string  the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier makes '$' match at the very end only. Without it a name
  // with a trailing newline would be taken for "already encoded" and be
  // returned with the raw newline in it.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  $file = urlencode($file);
  // path separators have to survive the encoding
  $file = str_replace('%2F','/',$file);
  return $file;
}
4449c713a3Sandi
/**
 * URL-Decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the URL-encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5749c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return boolean true when no byte has its high bit set (also for '')
 */
function utf8_isASCII($str){
  // byte-wise match (no /u flag): any byte in 0x80-0xFF is non-ASCII.
  // Replaces the former $str{$i} loop, a syntax PHP 8 no longer parses.
  return !preg_match('/[\x80-\xFF]/', $str);
}
6944f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string $str without any byte >= 0x80
 */
function utf8_strip($str){
  // byte-wise removal (no /u flag) of everything outside 0x00-0x7F.
  // Replaces the former $str{$i} loop, a syntax PHP 8 no longer parses.
  return preg_replace('/[\x80-\xFF]/', '', (string)$str);
}
86e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the bytes and verifies that every lead byte is followed by the
 * number of 10bbbbbb continuation bytes its bit pattern announces. Lead
 * bytes announcing up to five continuation bytes are accepted; the decoded
 * values themselves are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the bytes to check
 * @return boolean true when the byte structure is consistent
 */
function utf8_check($Str) {
  $total = strlen($Str);
  $pos   = 0;

  while ($pos < $total) {
    $lead = ord($Str[$pos]);
    $pos++;

    if ($lead < 0x80) continue;                   // 0bbbbbbb

    // how many continuation bytes does the lead byte announce?
    if     (($lead & 0xE0) == 0xC0) $follow = 1;  // 110bbbbb
    elseif (($lead & 0xF0) == 0xE0) $follow = 2;  // 1110bbbb
    elseif (($lead & 0xF8) == 0xF0) $follow = 3;  // 11110bbb
    elseif (($lead & 0xFC) == 0xF8) $follow = 4;  // 111110bb
    elseif (($lead & 0xFE) == 0xFC) $follow = 5;  // 1111110b
    else return false;                            // matches no model

    while ($follow-- > 0) {
      if ($pos == $total) return false;                     // string ends too early
      if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;   // not 10bbbbbb
      $pos++;
    }
  }

  return true;
}
11049c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * Every UTF-8 character consists of exactly one byte that is not a
 * 10bbbbbb continuation byte, so the character count is the byte count
 * minus the number of continuation bytes. This replaces the former
 * strlen(utf8_decode()) trick because utf8_decode() is deprecated as of
 * PHP 8.2.
 *
 * Note: for input that is not valid UTF-8 a stray continuation byte is not
 * counted as a character.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string UTF-8 encoded string
 * @return integer number of characters
 */
function utf8_strlen($string){
  $string = (string)$string;
  // byte-wise match, deliberately without the /u flag
  return strlen($string) - preg_match_all('/[\x80-\xBF]/', $string, $continuation);
}
1252f954959Sandi
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * With mbstring enabled (UTF8_MBSTRING) this simply delegates to mb_substr().
 * Otherwise the substring is cut out with a PCRE pattern using the 'u' flag.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param  string  $str    UTF-8 encoded string
 * @param  integer $offset number of UTF-8 characters offset (from left), negative counts from the end
 * @param  integer $length (optional) length in UTF-8 characters from offset, negative stops that far from the end
 * @return string  the substring; '' when nothing matches (the fallback never returns false)
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     *
     * the character count is "bytes minus 10bbbbbb continuation bytes"; this replaces the former
     * strlen(utf8_decode()) which is deprecated as of PHP 8.2
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && !is_null($length) && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';
    $strlen = null;                               // character count, computed lazily

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
      $strlen = strlen($str) - preg_match_all('/[\x80-\xBF]/', $str, $continuation);
      $offset = $strlen + $offset;
      if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
      $Ox = (int)($offset/65535);
      $Oy = $offset%65535;

      if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
      $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
      $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
      $length_pattern = '(.*)$';                  // the rest of the string
    } else {

      if (is_null($strlen)) {
        $strlen = strlen($str) - preg_match_all('/[\x80-\xBF]/', $str, $continuation);
      }
      if ($offset > $strlen) return '';           // another trivial case

      if ($length > 0) {

        $length = min($strlen-$offset, $length);  // reduce any length that would go past the end of the string

        $Lx = (int)($length/65535);
        $Ly = $length%65535;

        // +ve length requires ... a captured group of length characters
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

      } else if ($length < 0) {

        // the part to cut off at the end is longer than what is left after offset
        if ($length < ($offset - $strlen)) return '';

        $Lx = (int)((-$length)/65535);
        $Ly = (-$length)%65535;

        // -ve length requires ... capture everything except a group of -length characters
        //                         anchored at the tail-end of the string
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
      }
    }

    // a failed match (this includes invalid UTF-8 input) yields an empty string
    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}
22710f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * Positions are counted in UTF-8 characters. Negative values are treated
 * like substr_replace() treats them: a negative $start counts from the end
 * of the string, a negative $length gives the number of characters at the
 * end of the string that are kept. Note that, unlike substr_replace(), the
 * default $length is 0, i.e. the replacement is inserted at $start.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded string
 * @param  string  $replacement text to put in place of the removed part
 * @param  integer $start       character position where the replacement starts
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $string = (string)$string;
  $start  = (int)$start;
  $length = (int)$length;
  $total  = utf8_strlen($string);

  // map a negative or oversized start into the range 0..$total
  if($start < 0) $start = max(0, $total + $start);
  if($start > $total) $start = $total;

  // turn the length into the position where the untouched tail begins
  if($length < 0){
    $end = max($start, $total + $length);
  }else{
    $end = min($total, $start + $length);
  }

  $ret = '';
  if($start > 0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  if($end < $total) $ret .= utf8_substr($string, $end);
  return $ret;
}
241dc57ef04Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. With a charlist every
 * character in it is taken literally (no ".." ranges) and removed from the
 * start of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist UTF-8 characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote charlist for use in a characterclass; preg_quote() also covers
  // '^', which would otherwise negate the class when it comes first
  $charlist = preg_quote($charlist, '/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
257f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. With a charlist every
 * character in it is taken literally (no ".." ranges) and removed from the
 * end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist UTF-8 characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote charlist for use in a characterclass; preg_quote() also covers
  // '^', which would otherwise negate the class when it comes first
  $charlist = preg_quote($charlist, '/');

  // D modifier: '$' must match at the very end only, otherwise characters
  // in front of a trailing newline would be trimmed as well
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
273f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Uses the native trim() when no charlist is given, otherwise strips the
 * listed characters from the end and then from the start of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist UTF-8 characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist != ''){
    $tailless = utf8_rtrim($str,$charlist);
    return utf8_ltrim($tailless,$charlist);
  }

  return trim($str);
}
286f29317c1Sandi
2872f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Delegates to mb_strtolower() when UTF8_MBSTRING is set; otherwise the
 * string is translated through the global $UTF8_UPPER_TO_LOWER table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(!UTF8_MBSTRING){
    // no mbstring: fall back to the lookup table
    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
  }

  return mb_strtolower($string,'utf-8');
}
30382257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Delegates to mb_strtoupper() when UTF8_MBSTRING is set; otherwise the
 * string is translated through the global $UTF8_LOWER_TO_UPPER table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  if(!UTF8_MBSTRING){
    // no mbstring: fall back to the lookup table
    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
  }

  return mb_strtoupper($string,'utf-8');
}
31982257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional parameter restricts the replacement to lower case letters
 * ($case = -1) or upper case letters ($case = 1). The default ($case = 0)
 * handles both. The replacements come from the global tables
 * $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string UTF-8 encoded string
 * @param  integer $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $doLower = ($case <= 0);
  $doUpper = ($case >= 0);

  if($doLower){
    global $UTF8_LOWER_ACCENTS;
    $string = strtr($string,$UTF8_LOWER_ACCENTS);
  }

  if($doUpper){
    global $UTF8_UPPER_ACCENTS;
    $string = strtr($string,$UTF8_UPPER_ACCENTS);
  }

  return $string;
}
33982257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned unchanged; everything else is translated
 * through the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  return $string; //nothing to do
}
3518a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2 the
 * control characters 0x00 to 0x19 are stripped as well. The quoted list
 * of special characters is built on the first call and then reused.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class,
 *                            inserted as is - the caller has to quote them)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;    // NOTE(review): imported but not read in this function
  global $UTF8_SPECIAL_CHARS2;

  // quoted once per request, kept in a static for later calls
  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
375099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * The byte-based strpos() is started at "character offset plus a byte
 * correction". As long as the hit lies in front of the requested character
 * offset, the correction is raised to the difference between the byte and
 * the character position of that hit and the search is repeated.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   UTF-8 encoded string to search for
 * @param  integer $offset   character position to start searching at
 * @return integer character position of the match, false when not found
 */
function utf8_strpos($haystack, $needle, $offset=0){
    $shift = 0;       // extra bytes to skip to make up for multibyte characters

    do {
        $bytepos = strpos($haystack, $needle, $offset + $shift);
        if ($bytepos === false) {
            return false;
        }

        // convert the byte position of the hit into a character position
        $charpos = utf8_strlen(substr($haystack, 0, $bytepos));

        if ($charpos < $offset) {
            $shift = $bytepos - $charpos;
        }
    } while ($charpos < $offset);

    return $charpos;
}
404f29317c1Sandi
4052f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII characters are passed through, code points below 0x100 become
 * decimal entities and everything above becomes a hexadecimal entity.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string ASCII only string with numeric entities
 */
function utf8_tohtml ($str) {
    $pieces = array();

    foreach (utf8_to_unicode($str) as $cp) {
        if ($cp >= 0x100) {
            $pieces[] = '&#x'.dechex($cp).';';
        } elseif ($cp >= 0x80) {
            $pieces[] = "&#$cp;";
        } else {
            $pieces[] = chr($cp);
        }
    }

    return implode('', $pieces);
}
4259f9fb0e5STom N Harris
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * By default only numeric entities are decoded. Pass any non-null value
 * (e.g. HTML_ENTITIES) as second parameter and named entities, including
 * &amp; &lt; etc. are handled as well. Doing both in a single pass avoids
 * the problem that would occur if you had to decode "&amp;#38;&#38;amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 * what it should be                   -> "&#38;&amp#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities (null = numeric only).
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // the decoder object is created on the first call and shared afterwards
    static $decoder = null;
    if ($decoder === null) {
        $decoder = new utf8_entity_decoder();
    }

    if ($entities !== null) {
        // numeric and named entities
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
    }

    // numeric entities only
    return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                 'utf8_decode_numeric', $str);
}
/**
 * preg_replace_callback() handler turning one numeric entity into UTF-8
 *
 * Expects the match layout produced by the patterns in utf8_unhtml():
 * index 0 is the complete entity, index 2 is 'x'/'X' for hexadecimal
 * entities (empty otherwise) and index 3 holds the digits.
 *
 * The patterns accept any alphanumeric run as "digits", so something like
 * "&#abc;" or "&#xZZ;" ends up here as well. Such malformed entities are
 * returned unchanged instead of being decoded to a NUL character.
 *
 * @param  array  $ent match array from preg_replace_callback()
 * @return string the UTF-8 character or the untouched entity
 */
function utf8_decode_numeric($ent) {
    $digits = $ent[3];

    if ($ent[2] === 'X' || $ent[2] === 'x') {
        if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) return $ent[0];
        $cp = hexdec($digits);
    } else {
        if (!preg_match('/^[0-9]+$/D', $digits)) return $ent[0];
        $cp = intval($digits);
    }

    return unicode_to_utf8(array($cp));
}
/**
 * Callback object used by utf8_unhtml() to decode named and numeric entities
 *
 * Holds a lookup table from named entity (e.g. "&auml;") to the matching
 * UTF-8 character.
 */
class utf8_entity_decoder {
    /** @var array named entity => UTF-8 character */
    var $table;

    /**
     * Build the lookup table
     *
     * The translation table is requested in UTF-8 explicitly, so flipping it
     * yields the wanted mapping directly and does not depend on the default
     * encoding of the running PHP version.
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');
        $this->table = array_flip($table);
    }

    /**
     * Old PHP4 style constructor name
     *
     * PHP 8 no longer treats this method as a constructor; it is kept as a
     * plain method so existing explicit calls keep working.
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert a single byte, taken as a code point, into UTF-8
     *
     * No longer used to build the table; kept for backward compatibility.
     *
     * @param  string $c single byte character
     * @return string
     */
    function makeutf8($c) {
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * preg_replace_callback() handler for the pattern used in utf8_unhtml()
     *
     * @param  array  $ent match: 0 = whole entity, 1 = '#' for numeric entities
     * @return string decoded character, or the entity itself when unknown
     */
    function decode($ent) {
        if ($ent[1] === '#') {
            return utf8_decode_numeric($ent);
        }
        if (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        }
        return $ent[0];
    }
}
487ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. This includes a multi-octet sequence that is cut
 * off by the end of the string.
 *
 * Without $strict, bytes that do not fit are skipped silently and illegal
 * code points are passed through.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // square brackets: the {} offset syntax is gone in PHP 8
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
        }
    }

    // a sequence still open here was cut off by the end of the string
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                '   sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return false;
    }

    return $out;
}
65982257610Sandi
/**
 * Takes an array of ints representing Unicode code points and returns
 * a UTF-8 string. Astral planes are supported, i.e. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM (0xFEFF) are ignored.
 * Surrogates are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or > 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are silently skipped.
 *
 * Note: the result is assembled by plain string concatenation instead of
 * output buffering, so bailing out early in strict mode can never leave
 * an output buffer open behind the caller's back.
 *
 * @param  array   $arr    Unicode code points representing a string
 * @param  boolean $strict Check for invalid code points?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        // ASCII range (including control chars)
        if ( ($cp >= 0) && ($cp <= 0x007f) ) {

            $utf8 .= chr($cp);

        // negative or beyond the last Unicode code point
        } else if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }
            // non-strict: drop the value silently

        // 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        // Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        // Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }
            // non-strict: drop the value silently

        // 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        // 4 byte sequence (everything left is <= 0x10ffff)
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $utf8;
}
75082257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the string
 * is decoded with utf8_to_unicode() and every code point is packed as a
 * big-endian 16 bit unit; code points above 0xFFFF are written as a
 * surrogate pair instead of being truncated to their low 16 bits.
 *
 * @param  string $str UTF-8 encoded input (taken by reference, not modified)
 * @param  bool   $bom prepend the UTF-16BE byte order mark (FE FF)?
 * @return string UTF-16BE encoded string
 * @see    utf16be_to_utf8
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // NOTE(review): utf8_to_unicode() can return false in its strict mode;
  // guard so foreach never sees a non-array
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // astral plane: split into lead/trail surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
76615fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is read as big-endian 16 bit units. A lead surrogate
 * (0xD800-0xDBFF) directly followed by a trail surrogate (0xDC00-0xDFFF)
 * is combined into the code point it encodes; unpaired surrogates are
 * handed on unchanged, so unicode_to_utf8() decides what happens to them.
 * The resulting code points are encoded with unicode_to_utf8().
 *
 * @param  string $str UTF-16BE encoded input (taken by reference, not modified)
 * @return string UTF-8 encoded string
 * @see    utf8_to_utf16be
 * @see    unicode_to_utf8
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $lead = 0; // pending lead surrogate, 0 if none
  foreach($units as $unit){
    if($lead){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        // complete pair -> one astral code point
        $uni[] = 0x10000 + (($lead - 0xD800) << 10) + ($unit - 0xDC00);
        $lead  = 0;
        continue;
      }
      // lead surrogate without its partner: pass it on as is
      $uni[] = $lead;
      $lead  = 0;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $lead = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($lead) $uni[] = $lead;

  return unicode_to_utf8($uni);
}
77615fa0b4fSAndreas Gohr
/**
 * Replace bad bytes with an alternative character
 *
 * The string is consumed from left to right: every well-formed UTF-8
 * sequence is copied to the result, every byte that is not part of one
 * is replaced with $replace. With the default (empty string) bad bytes
 * are simply removed. An ASCII character is recommended as replacement.
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str     string to search
 * @param string $replace string to replace bad bytes with (defaults to '',
 *                        i.e. bad bytes are dropped) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte
    $pattern = '/'.$UTF8_BAD.'/S';

    $result = '';
    $length = strlen($str);
    $pos    = 0;

    // walk through the string using the offset argument instead of
    // re-copying the remainder with substr() after every match
    while ($pos < $length && preg_match($pattern, $str, $matches, 0, $pos)) {
        // group 2 only takes part in the match for an invalid byte
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos    += strlen($matches[0]);
    }

    return $result;
}
816ab77016bSAndreas Gohr
/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * An index that points at a continuation byte (10xxxxxx) lies inside a
 * multi-byte character. Depending on $next it is moved back to the first
 * byte of that character or forward to the first byte after it. Indexes
 * <= 0 yield 0, indexes at or beyond the end of the string yield the
 * string length.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = backwards (start of the current character)
 *                           true  = forwards (start of the next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $limit = strlen($str);
  if ($i >= $limit) return $limit;

  if ($next) {
    // skip continuation bytes until the next lead byte or the end
    while (($i < $limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
  } else {
    // step back over continuation bytes to the lead byte
    while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
  }

  return $i;
}
8455953e889Schris
846ab77016bSAndreas Gohr// only needed if no mb_string available
847ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps each lower case letter to its corresponding
   * upper case letter in UTF-8
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
85654662a04SAndreas Gohr  global $UTF8_LOWER_TO_UPPER;
85754662a04SAndreas Gohr  $UTF8_LOWER_TO_UPPER = array(
85872de9068SAndreas Gohr    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
85972de9068SAndreas Gohr    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
86072de9068SAndreas Gohr    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
86172de9068SAndreas Gohr    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
86272de9068SAndreas Gohr    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
86372de9068SAndreas Gohr    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
86472de9068SAndreas Gohr    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
86572de9068SAndreas Gohr    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
86672de9068SAndreas Gohr    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
86772de9068SAndreas Gohr    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
86872de9068SAndreas Gohr    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
86972de9068SAndreas Gohr    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
87072de9068SAndreas Gohr    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
87172de9068SAndreas Gohr    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
87272de9068SAndreas Gohr    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
87372de9068SAndreas Gohr    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
87472de9068SAndreas Gohr    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
87572de9068SAndreas Gohr    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
87672de9068SAndreas Gohr    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
87772de9068SAndreas Gohr    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
87872de9068SAndreas Gohr    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
87972de9068SAndreas Gohr    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
88072de9068SAndreas Gohr    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
88172de9068SAndreas Gohr    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
88272de9068SAndreas Gohr    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
88372de9068SAndreas Gohr    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
88472de9068SAndreas Gohr    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
88572de9068SAndreas Gohr    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
88672de9068SAndreas Gohr    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
88772de9068SAndreas Gohr    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
88872de9068SAndreas Gohr    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
88972de9068SAndreas Gohr    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
89072de9068SAndreas Gohr    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
89172de9068SAndreas Gohr    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
89272de9068SAndreas Gohr    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
89372de9068SAndreas Gohr    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
89472de9068SAndreas Gohr    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
89572de9068SAndreas Gohr    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
89672de9068SAndreas Gohr    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
89772de9068SAndreas Gohr    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
89872de9068SAndreas Gohr    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
89972de9068SAndreas Gohr    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
90072de9068SAndreas Gohr    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
90172de9068SAndreas Gohr    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
90272de9068SAndreas Gohr    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
90372de9068SAndreas Gohr    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
90472de9068SAndreas Gohr    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
90572de9068SAndreas Gohr    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
90672de9068SAndreas Gohr    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
90772de9068SAndreas Gohr    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
90872de9068SAndreas Gohr    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
90972de9068SAndreas Gohr    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
91072de9068SAndreas Gohr    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
91172de9068SAndreas Gohr    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
91272de9068SAndreas Gohr    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
91372de9068SAndreas Gohr    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
91472de9068SAndreas Gohr    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
91572de9068SAndreas Gohr    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
91672de9068SAndreas Gohr    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
91772de9068SAndreas Gohr    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
91872de9068SAndreas Gohr    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
91972de9068SAndreas Gohr    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
92072de9068SAndreas Gohr    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
92172de9068SAndreas Gohr    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
92272de9068SAndreas Gohr    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
92372de9068SAndreas Gohr    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
92472de9068SAndreas Gohr    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
92582257610Sandi  );
92682257610Sandi
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps each upper case letter to its corresponding
   * lower case letter in UTF-8
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
93554662a04SAndreas Gohr  global $UTF8_UPPER_TO_LOWER;
93672de9068SAndreas Gohr  $UTF8_UPPER_TO_LOWER = array (
93772de9068SAndreas Gohr    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
93872de9068SAndreas Gohr    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
93972de9068SAndreas Gohr    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
94072de9068SAndreas Gohr    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
94172de9068SAndreas Gohr    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
94272de9068SAndreas Gohr    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
94372de9068SAndreas Gohr    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
94472de9068SAndreas Gohr    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
94572de9068SAndreas Gohr    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
94672de9068SAndreas Gohr    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
94772de9068SAndreas Gohr    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
94872de9068SAndreas Gohr    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
94972de9068SAndreas Gohr    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
95072de9068SAndreas Gohr    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
95172de9068SAndreas Gohr    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
95272de9068SAndreas Gohr    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
95372de9068SAndreas Gohr    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
95472de9068SAndreas Gohr    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
95572de9068SAndreas Gohr    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
95672de9068SAndreas Gohr    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
95772de9068SAndreas Gohr    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
95872de9068SAndreas Gohr    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
95972de9068SAndreas Gohr    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
96072de9068SAndreas Gohr    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
96172de9068SAndreas Gohr    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
96272de9068SAndreas Gohr    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
96372de9068SAndreas Gohr    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
96472de9068SAndreas Gohr    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
96572de9068SAndreas Gohr    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
96672de9068SAndreas Gohr    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
96772de9068SAndreas Gohr    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
96872de9068SAndreas Gohr    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
96972de9068SAndreas Gohr    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
97072de9068SAndreas Gohr    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
97172de9068SAndreas Gohr    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
97272de9068SAndreas Gohr    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
97372de9068SAndreas Gohr    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
97472de9068SAndreas Gohr    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
97572de9068SAndreas Gohr    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
97672de9068SAndreas Gohr    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
97772de9068SAndreas Gohr    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
97872de9068SAndreas Gohr    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
97972de9068SAndreas Gohr    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
98072de9068SAndreas Gohr    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
98172de9068SAndreas Gohr    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
98272de9068SAndreas Gohr    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
98372de9068SAndreas Gohr    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
98472de9068SAndreas Gohr    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
98572de9068SAndreas Gohr    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
98672de9068SAndreas Gohr    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
98772de9068SAndreas Gohr    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
98872de9068SAndreas Gohr    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
98972de9068SAndreas Gohr    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
99072de9068SAndreas Gohr    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
99172de9068SAndreas Gohr    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
99272de9068SAndreas Gohr    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
99372de9068SAndreas Gohr    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
99472de9068SAndreas Gohr    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
99572de9068SAndreas Gohr    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
99672de9068SAndreas Gohr    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
99772de9068SAndreas Gohr    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
99872de9068SAndreas Gohr    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
99972de9068SAndreas Gohr    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
100072de9068SAndreas Gohr    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
100172de9068SAndreas Gohr    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
100272de9068SAndreas Gohr    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
100372de9068SAndreas Gohr    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
100472de9068SAndreas Gohr  );
100572de9068SAndreas Gohr}; // end of case lookup tables
1006ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented characters to replacements from the
 * 7-bit ASCII range. These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);
103482257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented characters to replacements from the
 * 7-bit ASCII range. These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
106282257610Sandi
1063099ada41Sandi/**
1064099ada41Sandi * Array of common special characters, listed as integer code points
1065099ada41Sandi *
1066099ada41Sandi * This array should contain all special characters (not a letter or digit)
1067099ada41Sandi * defined in the various local charsets - it is not a complete list of non-alphanum
1068099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special
1069099ada41Sandi * chars.
1070099ada41Sandi *
1071099ada41Sandi * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1072ad81d431SAndreas Gohr * These chars are _not_ in the array either: _ (0x5f), : (0x3a), . (0x2e), - (0x2d), * (0x2a)
1073099ada41Sandi * NOTE(review): 0xff09 is listed twice (end of one row, start of the next) - confirm whether intended.
1074099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
1075099ada41Sandi * @see    utf8_stripspecials()
1076099ada41Sandi */
107754662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS;
1078099ada41Sandi$UTF8_SPECIAL_CHARS = array(
1079099ada41Sandi  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1080ad81d431SAndreas Gohr  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
10815c812709Sandi          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
10825c812709Sandi  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1083099ada41Sandi  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1084099ada41Sandi  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1085099ada41Sandi  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1086099ada41Sandi  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1087099ada41Sandi  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1088099ada41Sandi  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1089099ada41Sandi  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1090099ada41Sandi  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1091099ada41Sandi  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1092099ada41Sandi  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1093099ada41Sandi  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1094099ada41Sandi  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1095099ada41Sandi  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1096099ada41Sandi  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1097099ada41Sandi  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1098099ada41Sandi  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1099099ada41Sandi  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1100099ada41Sandi  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1101099ada41Sandi  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1102099ada41Sandi  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1103099ada41Sandi  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1104099ada41Sandi  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1105099ada41Sandi  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1106099ada41Sandi  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1107099ada41Sandi  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1108099ada41Sandi  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1109099ada41Sandi  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1110099ada41Sandi  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1111099ada41Sandi  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1112099ada41Sandi  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1113099ada41Sandi  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1114099ada41Sandi  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1115099ada41Sandi  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1116099ada41Sandi  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1117099ada41Sandi  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1118099ada41Sandi  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1119099ada41Sandi  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1120099ada41Sandi  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1121099ada41Sandi  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1122099ada41Sandi  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1123099ada41Sandi  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1124d5b23302STom N Harris  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1125d5b23302STom N Harris  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1126d5b23302STom N Harris  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1127d5b23302STom N Harris  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1128099ada41Sandi  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1129099ada41Sandi  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1130099ada41Sandi  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1131099ada41Sandi  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1132d5b23302STom N Harris          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1133d5b23302STom N Harris  0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
1134d5b23302STom N Harris  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1135d5b23302STom N Harris  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1136d5b23302STom N Harris  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1137d5b23302STom N Harris  0xffeb, 0xffec, 0xffed, 0xffee,
1138099ada41Sandi);
1139340756e4Sandi
1140720307d9Schris// UTF-8 encoded string version of the $UTF8_SPECIAL_CHARS data above
1141720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1142720307d9Schris$UTF8_SPECIAL_CHARS2 =
114337242afaSTom N Harris    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1144720307d9Schris    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1145720307d9Schris    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1146720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1147720307d9Schris    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1148720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1149720307d9Schris    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1150720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1151720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1152720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1153720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1154720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1155720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1156720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1157d5b23302STom N Harris    '➷➸➹➺➻➼➽➾'.
1158d5b23302STom N Harris    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1159d5b23302STom N Harris    '�'.
1160d5b23302STom N Harris    '�ﹼﹽ'.
1161d5b23302STom N Harris    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1162d5b23302STom N Harris    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○';
1163720307d9Schris
11648a831f2bSAndreas Gohr/**
11658a831f2bSAndreas Gohr * Romanization lookup table
11668a831f2bSAndreas Gohr *
11678a831f2bSAndreas Gohr * This lookup table provides a way to transform strings written in a script
11688a831f2bSAndreas Gohr * other than the ones based upon Latin letters into plain ASCII.
11698a831f2bSAndreas Gohr *
11708a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
11718a831f2bSAndreas Gohr * one way, from non-Latin to ASCII, and it works by simple character replacement
11728a831f2bSAndreas Gohr * only. Special rules of each language are not supported.
11738a831f2bSAndreas Gohr *
11748a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
11758a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
11768a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
11778a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
11788a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
11798a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
11808a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
11818a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
1182014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com>
1183fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de>
11848a831f2bSAndreas Gohr */
118554662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
11868a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
11878a831f2bSAndreas Gohr  //russian cyrillic
11888a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
11898a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
11908a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
11918a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
11928a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
11938a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1194d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1195f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
11968a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
11978a831f2bSAndreas Gohr  // Ukrainian cyrillic
11988a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
11998a831f2bSAndreas Gohr  // Georgian
12008a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
12018a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
12028a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
12038a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
12048a831f2bSAndreas Gohr  'ჰ'=>'xh',
12058a831f2bSAndreas Gohr  //Sanskrit
12068a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
12078a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
12088a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
12098a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
12108a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
12118a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
12128a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
12138a831f2bSAndreas Gohr  //Hebrew
12143dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
12153dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
12163dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
12178a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
12188a831f2bSAndreas Gohr  //Arabic
12198a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
12208a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
12218a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
12228a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
12238a831f2bSAndreas Gohr
1224*9476a253SAndreas Gohr  // Japanese characters  (last update: 2008-05-05)
1225*9476a253SAndreas Gohr
12268a831f2bSAndreas Gohr  // Japanese hiragana
1227fed467f8SDenis Scheither
1228fed467f8SDenis Scheither  // 3 character syllables, っ doubles the consonant after
1229fed467f8SDenis Scheither  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1230fed467f8SDenis Scheither  'っびゃ'=>'bya','っびぇ'=>'bye','っびぃ'=>'byi','っびょ'=>'byo','っびゅ'=>'byu',
1231fed467f8SDenis Scheither  'っちゃ'=>'cha','っちぇ'=>'che','っち'=>'chi','っちょ'=>'cho','っちゅ'=>'chu',
1232fed467f8SDenis Scheither  'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1233fed467f8SDenis Scheither  'っきゃ'=>'kya','っきぇ'=>'kye','っきぃ'=>'kyi','っきょ'=>'kyo','っきゅ'=>'kyu',
1234fed467f8SDenis Scheither  'っぎゃ'=>'gya','っぎぇ'=>'gye','っぎぃ'=>'gyi','っぎょ'=>'gyo','っぎゅ'=>'gyu',
1235fed467f8SDenis Scheither  'っみゃ'=>'mya','っみぇ'=>'mye','っみぃ'=>'myi','っみょ'=>'myo','っみゅ'=>'myu',
1236fed467f8SDenis Scheither  'っにゃ'=>'nya','っにぇ'=>'nye','っにぃ'=>'nyi','っにょ'=>'nyo','っにゅ'=>'nyu',
1237fed467f8SDenis Scheither  'っりゃ'=>'rya','っりぇ'=>'rye','っりぃ'=>'ryi','っりょ'=>'ryo','っりゅ'=>'ryu',
1238fed467f8SDenis Scheither  'っしゃ'=>'sha','っしぇ'=>'she','っし'=>'shi','っしょ'=>'sho','っしゅ'=>'shu',
1239fed467f8SDenis Scheither
1240fed467f8SDenis Scheither   // 2 character syllables - normal
1241fed467f8SDenis Scheither  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo','ふ'=>'fu',
1242fed467f8SDenis Scheither  'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1243fed467f8SDenis Scheither  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1244fed467f8SDenis Scheither  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1245fed467f8SDenis Scheither  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1246fed467f8SDenis Scheither  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1247fed467f8SDenis Scheither  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1248fed467f8SDenis Scheither  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1249fed467f8SDenis Scheither  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1250fed467f8SDenis Scheither  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1251fed467f8SDenis Scheither  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1252fed467f8SDenis Scheither  'じゃ'=>'ja','じぇ'=>'je','じ'=>'ji','じょ'=>'jo','じゅ'=>'ju',
1253fed467f8SDenis Scheither
1254fed467f8SDenis Scheither  // 2 character syllables, っ doubles the consonant after
1255fed467f8SDenis Scheither  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1256fed467f8SDenis Scheither  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1257fed467f8SDenis Scheither  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1258fed467f8SDenis Scheither  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1259fed467f8SDenis Scheither  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1260fed467f8SDenis Scheither  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1261fed467f8SDenis Scheither  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1262fed467f8SDenis Scheither  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1263fed467f8SDenis Scheither  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1264fed467f8SDenis Scheither  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1265fed467f8SDenis Scheither  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'zzi','っぞ'=>'zzo','っず'=>'zzu',
1266fed467f8SDenis Scheither
1267fed467f8SDenis Scheither  // 1 character syllables
1268fed467f8SDenis Scheither  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1269fed467f8SDenis Scheither  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu',
1270fed467f8SDenis Scheither  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1271fed467f8SDenis Scheither  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1272*9476a253SAndreas Gohr  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1273fed467f8SDenis Scheither  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1274fed467f8SDenis Scheither  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1275fed467f8SDenis Scheither  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1276fed467f8SDenis Scheither  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1277fed467f8SDenis Scheither  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1278fed467f8SDenis Scheither  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1279fed467f8SDenis Scheither  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1280fed467f8SDenis Scheither  'わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo',
1281fed467f8SDenis Scheither  'ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo','ず'=>'zu',
1282fed467f8SDenis Scheither  'や'=>'ya','いぇ'=>'ye','よ'=>'yo','ゆ'=>'yu',
1283*9476a253SAndreas Gohr  // old characters
1284*9476a253SAndreas Gohr  'ゑ'=>'we','ゐ'=>'wi',
1285fed467f8SDenis Scheither
1286*9476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1287*9476a253SAndreas Gohr  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1288*9476a253SAndreas Gohr  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1289fed467f8SDenis Scheither
1290*9476a253SAndreas Gohr  // never seen one of those (disabled for the moment)
1291*9476a253SAndreas Gohr  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1292*9476a253SAndreas Gohr  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1293*9476a253SAndreas Gohr  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1294*9476a253SAndreas Gohr  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1295*9476a253SAndreas Gohr  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1296*9476a253SAndreas Gohr  // 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1297*9476a253SAndreas Gohr  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1298*9476a253SAndreas Gohr  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1299*9476a253SAndreas Gohr  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1300*9476a253SAndreas Gohr  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1301*9476a253SAndreas Gohr  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1302*9476a253SAndreas Gohr  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1303*9476a253SAndreas Gohr  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1304*9476a253SAndreas Gohr  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1305fed467f8SDenis Scheither
1306fed467f8SDenis Scheither  // 'spare' characters from other romanization systems
1307fed467f8SDenis Scheither  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1308fed467f8SDenis Scheither  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1309fed467f8SDenis Scheither  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1310fed467f8SDenis Scheither  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1311fed467f8SDenis Scheither  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1312fed467f8SDenis Scheither  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1313fed467f8SDenis Scheither  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1314fed467f8SDenis Scheither  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1315fed467f8SDenis Scheither  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1316fed467f8SDenis Scheither  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1317fed467f8SDenis Scheither
1318fed467f8SDenis Scheither
13198a831f2bSAndreas Gohr  // Japanese katakana
1320fed467f8SDenis Scheither
1321fed467f8SDenis Scheither  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with a macron, but we don't want that in our URLs)
1322fed467f8SDenis Scheither  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1323fed467f8SDenis Scheither  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1324fed467f8SDenis Scheither  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1325fed467f8SDenis Scheither  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1326fed467f8SDenis Scheither  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1327fed467f8SDenis Scheither  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1328fed467f8SDenis Scheither  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1329fed467f8SDenis Scheither  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1330fed467f8SDenis Scheither  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1331fed467f8SDenis Scheither
1332fed467f8SDenis Scheither  // 3 character syllables - doubled vowels
1333fed467f8SDenis Scheither  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1334fed467f8SDenis Scheither  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1335fed467f8SDenis Scheither  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1336fed467f8SDenis Scheither  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1337fed467f8SDenis Scheither  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1338fed467f8SDenis Scheither  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1339fed467f8SDenis Scheither  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1340fed467f8SDenis Scheither  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1341fed467f8SDenis Scheither  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1342fed467f8SDenis Scheither  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1343fed467f8SDenis Scheither  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1344fed467f8SDenis Scheither  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1345fed467f8SDenis Scheither  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1346fed467f8SDenis Scheither  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1347fed467f8SDenis Scheither  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1348fed467f8SDenis Scheither  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1349fed467f8SDenis Scheither  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1350fed467f8SDenis Scheither  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1351fed467f8SDenis Scheither  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1352fed467f8SDenis Scheither  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1353fed467f8SDenis Scheither  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1354fed467f8SDenis Scheither  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1355fed467f8SDenis Scheither  'ウェー'=>'wee','ウィー'=>'wii',
1356fed467f8SDenis Scheither  'イェー'=>'yee',
1357fed467f8SDenis Scheither
1358fed467f8SDenis Scheither  // 3 character syllables - doubled consonants
1359fed467f8SDenis Scheither  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1360fed467f8SDenis Scheither  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1361fed467f8SDenis Scheither  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1362fed467f8SDenis Scheither  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1363fed467f8SDenis Scheither  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1364fed467f8SDenis Scheither  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1365fed467f8SDenis Scheither  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1366fed467f8SDenis Scheither  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1367fed467f8SDenis Scheither  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1368fed467f8SDenis Scheither
1369fed467f8SDenis Scheither  // 3 character syllables - doubled vowel and consonants
1370fed467f8SDenis Scheither  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1371fed467f8SDenis Scheither  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1372fed467f8SDenis Scheither  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1373fed467f8SDenis Scheither  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1374fed467f8SDenis Scheither  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1375fed467f8SDenis Scheither  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1376fed467f8SDenis Scheither  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1377fed467f8SDenis Scheither  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1378fed467f8SDenis Scheither  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'zzii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1379fed467f8SDenis Scheither  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttssuu',
1380fed467f8SDenis Scheither  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1381fed467f8SDenis Scheither
1382fed467f8SDenis Scheither  // 2 character syllables - normal
1383fed467f8SDenis Scheither  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo',
1384fed467f8SDenis Scheither  'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1385fed467f8SDenis Scheither  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1386fed467f8SDenis Scheither  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1387fed467f8SDenis Scheither  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1388fed467f8SDenis Scheither  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1389fed467f8SDenis Scheither  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1390fed467f8SDenis Scheither  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1391fed467f8SDenis Scheither  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1392fed467f8SDenis Scheither  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1393fed467f8SDenis Scheither  'シャ'=>'sha','シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu',
1394fed467f8SDenis Scheither  'ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju',
1395fed467f8SDenis Scheither  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1396fed467f8SDenis Scheither  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1397fed467f8SDenis Scheither  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1398fed467f8SDenis Scheither  'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1399fed467f8SDenis Scheither  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1400fed467f8SDenis Scheither  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1401fed467f8SDenis Scheither  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1402fed467f8SDenis Scheither  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1403fed467f8SDenis Scheither  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1404fed467f8SDenis Scheither  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1405fed467f8SDenis Scheither  'ウェ'=>'we','ウィ'=>'wi',
1406fed467f8SDenis Scheither  'イェ'=>'ye',
1407fed467f8SDenis Scheither
1408fed467f8SDenis Scheither  // 2 character syllables - doubled vowel
1409fed467f8SDenis Scheither  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1410fed467f8SDenis Scheither  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1411fed467f8SDenis Scheither  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1412fed467f8SDenis Scheither  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1413fed467f8SDenis Scheither  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1414fed467f8SDenis Scheither  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1415fed467f8SDenis Scheither  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1416fed467f8SDenis Scheither  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1417fed467f8SDenis Scheither  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1418fed467f8SDenis Scheither  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1419fed467f8SDenis Scheither  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1420fed467f8SDenis Scheither  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'zii','ゾー'=>'zoo','ズー'=>'zuu',
1421fed467f8SDenis Scheither  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1422fed467f8SDenis Scheither  'ワー'=>'waa','ヲー'=>'woo',
1423fed467f8SDenis Scheither  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1424fed467f8SDenis Scheither  'ヵー'=>'kaa','ヶー'=>'kee',
1425*9476a253SAndreas Gohr  // old characters
1426*9476a253SAndreas Gohr  'ヱー'=>'wee','ヰー'=>'wii',
1427fed467f8SDenis Scheither
1428fed467f8SDenis Scheither  // 2 character syllables - doubled consonants
1429fed467f8SDenis Scheither  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1430fed467f8SDenis Scheither  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1431fed467f8SDenis Scheither  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1432fed467f8SDenis Scheither  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1433fed467f8SDenis Scheither  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1434fed467f8SDenis Scheither  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1435fed467f8SDenis Scheither  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1436fed467f8SDenis Scheither  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1437fed467f8SDenis Scheither  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'zzi','ッゾ'=>'zzo','ッズ'=>'zzu',
1438fed467f8SDenis Scheither  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'chi','ット'=>'tto','ッツ'=>'ttssu',
1439fed467f8SDenis Scheither  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1440fed467f8SDenis Scheither
1441fed467f8SDenis Scheither  // 1 character syllables
1442fed467f8SDenis Scheither  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1443fed467f8SDenis Scheither  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1444fed467f8SDenis Scheither  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1445fed467f8SDenis Scheither  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1446fed467f8SDenis Scheither  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1447fed467f8SDenis Scheither  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1448fed467f8SDenis Scheither  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1449fed467f8SDenis Scheither  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1450fed467f8SDenis Scheither  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1451fed467f8SDenis Scheither  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1452fed467f8SDenis Scheither  'ザ'=>'za','ゼ'=>'ze','ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu',
1453fed467f8SDenis Scheither  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1454fed467f8SDenis Scheither  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1455fed467f8SDenis Scheither  'ワ'=>'wa','ヲ'=>'wo',
1456fed467f8SDenis Scheither  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1457fed467f8SDenis Scheither  'ヵ'=>'ka','ヶ'=>'ke',
1458*9476a253SAndreas Gohr  // old characters
1459*9476a253SAndreas Gohr  'ヱ'=>'we','ヰ'=>'wi',
1460fed467f8SDenis Scheither
1461*9476a253SAndreas Gohr  //  convert what's left (probably only kicks in when something's missing above)
1462fed467f8SDenis Scheither  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1463fed467f8SDenis Scheither  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1464fed467f8SDenis Scheither
1465fed467f8SDenis Scheither  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1466fed467f8SDenis Scheither  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1467fed467f8SDenis Scheither  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1468fed467f8SDenis Scheither  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1469fed467f8SDenis Scheither  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1470fed467f8SDenis Scheither  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1471fed467f8SDenis Scheither  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1472fed467f8SDenis Scheither  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1473fed467f8SDenis Scheither  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1474fed467f8SDenis Scheither  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1475fed467f8SDenis Scheither  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1476fed467f8SDenis Scheither  //'シ'=>'ci','フ'=>'hu','シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
14778a831f2bSAndreas Gohr
14788a831f2bSAndreas Gohr  // "Greeklish"
14798a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
14808a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
14818a831f2bSAndreas Gohr
14828a831f2bSAndreas Gohr  // Thai
14838a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
14848a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
14858a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
14868a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
14878a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
14888a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1489014d0ab6SAndreas Gohr  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1490014d0ab6SAndreas Gohr  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1491014d0ab6SAndreas Gohr  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1492014d0ab6SAndreas Gohr  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1493014d0ab6SAndreas Gohr  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1494014d0ab6SAndreas Gohr  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1495014d0ab6SAndreas Gohr  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1496014d0ab6SAndreas Gohr  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1497014d0ab6SAndreas Gohr  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1498014d0ab6SAndreas Gohr  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1499014d0ab6SAndreas Gohr	'๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1500014d0ab6SAndreas Gohr  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
15018a831f2bSAndreas Gohr
15028a831f2bSAndreas Gohr  // Korean
15038a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
15048a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
15058a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
15068a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
15078a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
15088a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
15098a831f2bSAndreas Gohr);
1510340756e4Sandi
1511340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
15128a831f2bSAndreas Gohr
1513