<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * Check for mb_string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has not
 * been defined, 0 otherwise. A value that was defined earlier is left alone.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }

if(!function_exists('utf8_encodeFN')){
    /**
     * URL-Encode a filename to allow unicode characters
     *
     * Slashes are not encoded.
     *
     * When the second parameter is true the string is only encoded if it
     * contains something other than letters, digits and the characters
     * "/", "_", "-", "." and "%". This makes it safe to run the function
     * multiple times on the same string (default is true).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urlencode
     * @param  string  $file the filename to encode
     * @param  boolean $safe skip strings that look encoded/plain already
     * @return string
     */
    function utf8_encodeFN($file,$safe=true){
        if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
            return $file;
        }
        $file = urlencode($file);
        // urlencode() also encodes the path separator - undo that
        $file = str_replace('%2F','/',$file);
        return $file;
    }
}

if(!function_exists('utf8_decodeFN')){
    /**
     * URL-Decode a filename
     *
     * This is just a wrapper around urldecode
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urldecode
     * @param  string $file the encoded filename
     * @return string
     */
    function utf8_decodeFN($file){
        $file = urldecode($file);
        return $file;
    }
}

if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <netzmeister@andreas-haerter.de>
     * @param  string $str
     * @return boolean true when no byte above 0x7F was found
     */
    function utf8_isASCII($str){
        // no 'u' modifier: the pattern is applied to single bytes
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }
}

if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str
     * @return string the input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the curly brace offset syntax was removed in PHP 8
            if(ord($str[$i]) <128){
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}

if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is checked: every lead byte has to be followed
     * by the number of continuation bytes (10bbbbbb) it announces. Lead bytes
     * announcing up to five continuation bytes are accepted; overlong forms,
     * surrogates and code points above U+10FFFF are not rejected here.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str
     * @return boolean
     */
    function utf8_check($Str) {
        $len = strlen($Str);
        for ($i=0; $i<$len; $i++) {
            $b = ord($Str[$i]);
            if ($b < 0x80) continue; # 0bbbbbbb
            elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
            elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
            elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
            elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
            elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
            else return false; # Does not match any model

            for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
                if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                    return false;
            }
        }
        return true;
    }
}

if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * Uses mb_strlen() when the mb_string extension is enabled. Otherwise
     * all bytes that are not UTF-8 continuation bytes (0x80-0xBF) are
     * counted, which equals the number of characters for valid UTF-8.
     * utf8_decode() is no longer used because it is deprecated as of PHP 8.2.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @param  string $string
     * @return integer number of UTF-8 characters
     */
    function utf8_strlen($string){
        if(UTF8_MBSTRING) return mb_strlen($string,'UTF-8');

        // byte wise pattern (no 'u' modifier): drop continuation bytes
        return strlen(preg_replace('/[\x80-\xBF]/','',(string)$string));
    }
}

if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param  string  $str
     * @param  integer $offset number of UTF-8 characters offset (from left)
     * @param  integer $length (optional) length in UTF-8 characters from offset
     * @return string  the requested part, an empty string if there is none
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = utf8_strlen($str); // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^'; // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$'; // the rest of the string
        } else {

            if (!isset($strlen)) $strlen = utf8_strlen($str); // see notes
            if ($offset > $strlen) return ''; // another trivial case

            if ($length > 0) {

                // reduce any length that would go past the end of the string
                $length = min($strlen-$offset, $length);

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}

if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Start and length are counted in UTF-8 characters. Negative values are
     * counted from the end of the string like substr_replace() does. Note
     * that the default length is 0, so by default the replacement is
     * inserted at $start without removing anything.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string  $string
     * @param  string  $replacement
     * @param  integer $start
     * @param  integer $length
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $strlen = utf8_strlen($string);
        $start  = (int)$start;
        $length = (int)$length;

        // translate $start into a position between 0 and $strlen
        if($start < 0){
            $start = max(0, $strlen + $start);
        }elseif($start > $strlen){
            $start = $strlen;
        }

        // position of the first character kept after the replaced part
        if($length < 0){
            $end = max($start, $strlen + $length);
        }else{
            $end = min($strlen, $start + $length);
        }

        $ret = '';
        if($start > 0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        if($end < $strlen) $ret .= utf8_substr($string, $end);
        return $ret;
    }
}

if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a character list this simply calls ltrim(). With a list, the
     * listed characters are removed from the start of the string using a
     * regular expression in UTF-8 mode.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        //quote charlist for use in a characterclass ("^" included, a leading
        //one would negate the class otherwise)
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}

if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a character list this simply calls rtrim(). With a list, the
     * listed characters are removed from the end of the string using a
     * regular expression in UTF-8 mode.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        //quote charlist for use in a characterclass ("^" included, a leading
        //one would negate the class otherwise)
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}

if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist characters to strip from both ends
     * @return string
     */
    function utf8_trim($str,$charlist='') {
        if($charlist == '') return trim($str);

        return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
    }
}

if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_UPPER_TO_LOWER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

        global $UTF8_UPPER_TO_LOWER;
        return strtr($string,$UTF8_UPPER_TO_LOWER);
    }
}

if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

        global $UTF8_LOWER_TO_UPPER;
        return strtr($string,$UTF8_LOWER_TO_UPPER);
    }
}

if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        switch ( utf8_strlen($str) ) {
            case 0:
                return '';
            case 1:
                return utf8_strtoupper($str);
            default:
                // split into the first character and the remainder
                preg_match('/^(.{1})(.*)$/us', $str, $matches);
                return utf8_strtoupper($matches[1]).$matches[2];
        }
    }
}

if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first char of each word uppercase
     * @see    http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * $matches[0] is the whole match (leading whitespace plus word),
     * $matches[2] the leading whitespace (empty at the start of the string)
     * and $matches[3] the first character of the word.
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see    utf8_ucwords
     * @see    utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = isset($matches[2]) ? $matches[2] : '';
        $ucfirst   = utf8_strtoupper($matches[3]);

        // Cut by byte length instead of using ltrim(): ltrim() does not strip
        // form feeds (which the pattern treats as whitespace) but does strip
        // NUL bytes (which the pattern treats as part of a word).
        $word = substr($matches[0], strlen($leadingws));
        $rest = substr($word, strlen($matches[3]));

        return $leadingws . $ucfirst . $rest;
    }
}

if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
     * $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string  $string
     * @param  integer $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        if($case <= 0){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }
        if($case >= 0){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }
        return $string;
    }
}

if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string
     * @return string
     */
    function utf8_romanize($string){
        if(utf8_isASCII($string)) return $string; //nothing to do

        global $UTF8_ROMANIZATION;
        return strtr($string,$UTF8_ROMANIZATION);
    }
}

if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * The characters to strip are taken from the global string
     * $UTF8_SPECIAL_CHARS2. In addition the control chars 0x00 to 0x19 are
     * stripped.
     * NOTE(review): 0x1A-0x1F are not covered by that range - confirm
     * whether 0x1F was intended as the upper bound.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class,
     *                            it is inserted unquoted)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted list is built once per request only
        static $specials = null;
        if(is_null($specials)){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
    }
}

if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done byte wise with strpos(). The byte position
     * of a hit is converted to a character position; hits located before
     * the requested character offset make the search start over further
     * right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false     character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $comp   = 0;    // difference between byte and character position so far
        $length = null; // character position of the current hit
        $bytes  = strlen($haystack);

        while (is_null($length) || $length < $offset) {
            $start = $offset + $comp;

            // strpos() throws a ValueError on PHP 8 for offsets outside the string
            if ($start > $bytes || $start < -$bytes)
                return false;

            $pos = strpos($haystack, $needle, $start);

            if ($pos === false)
                return false;

            $length = utf8_strlen(substr($haystack, 0, $pos));

            if ($length < $offset)
                $comp = $pos - $length;
        }

        return $length;
    }
}

if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as they are, code points below 0x100
     * become decimal entities, everything else hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $ret = '';
        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp < 0x80)
                $ret .= chr($cp);
            elseif ($cp < 0x100)
                $ret .= "&#$cp;";
            else
                $ret .= '&#x'.dechex($cp).';';
        }
        return $ret;
    }
}

if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (decimal and hexadecimal) are
     * decoded. When $entities is anything but null, named entities are
     * decoded as well by utf8_entity_decoder. Numeric and named entities are
     * handled in a single pass over the string, so the result of decoding
     * one entity is never decoded a second time.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string $str      UTF-8 encoded string
     * @param  mixed  $entities Flag controlling decoding of named entities.
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        if (is_null($entities)) {
            return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                         'utf8_decode_numeric', $str);
        }

        // the decoder builds a translation table - create it on demand only
        static $decoder = null;
        if (is_null($decoder))
            $decoder = new utf8_entity_decoder();

        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array($decoder, 'decode'), $str);
    }
}

if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback decoding a single numeric entity
     *
     * Expects the match array produced by the patterns in utf8_unhtml():
     * index 2 holds "x" or "X" for hexadecimal entities, index 3 the digits.
     *
     * @param  array $ent match of a numeric entity
     * @return string the UTF-8 encoded character as returned by unicode_to_utf8()
     */
    function utf8_decode_numeric($ent) {
        switch ($ent[2]) {
            case 'X':
            case 'x':
                $cp = hexdec($ent[3]);
                break;
            default:
                $cp = intval($ent[3]);
                break;
        }
        return unicode_to_utf8(array($cp));
    }
}

if(!class_exists('utf8_entity_decoder')){
    /**
     * Decoder for named and numeric HTML entities, used by utf8_unhtml()
     *
     * Keeps a table mapping named entities (e.g. "&amp;") to their UTF-8
     * encoded characters.
     */
    class utf8_entity_decoder {
        var $table;

        /**
         * Builds the entity => UTF-8 character table
         *
         * The ISO-8859-1 translation table is requested explicitly: its
         * values are single bytes whose ordinal equals the Unicode code
         * point, which is what makeutf8() relies on.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
            $table = array_flip($table);
            $this->table = array_map(array($this,'makeutf8'), $table);
        }

        /**
         * Converts a single byte character to UTF-8
         *
         * @param  string $c single byte character
         * @return string result of unicode_to_utf8() for the byte's ordinal
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback decoding one entity match
         *
         * @param  array $ent match as produced by the second pattern in utf8_unhtml()
         * @return string the decoded character, or the untouched match when
         *                the entity is not known
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}

if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. Without $strict, illegal code points (overlong
     * forms, surrogates, values above 0x10FFFF) are still added to the
     * output, and unexpected octets are skipped silently.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the curly brace offset syntax was removed in PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            ' sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}

if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING
     *
     * Note: this function has been modified slightly in this library to use
     * output buffering to concatenate the UTF-8 string (faster) as well as
     * reference the array by it's keys
     *
     * @param array of unicode code points representing a string
     * @param boolean Check for invalid sequences?
if(!function_exists('unicode_to_utf8')){
    /**
     * Encode an array of Unicode code points as a UTF-8 string.
     *
     * Astral planes are supported, i.e. the ints in the input may be > 0xFFFF.
     * Occurrences of the BOM (0xFEFF) are dropped from the output. Surrogates
     * (0xD800-0xDFFF) and values outside 0-0x10FFFF cannot be encoded: they
     * are skipped, unless $strict is set, in which case an E_USER_WARNING is
     * raised and false is returned.
     *
     * The string is built by plain concatenation instead of output buffering,
     * so an early return in strict mode cannot leave an output buffer open.
     *
     * @param array   $arr    Unicode code points (ints) representing a string
     * @param boolean $strict Check for invalid code points?
     * @return mixed UTF-8 string, '' if $arr is not an array, or false if
     *               $strict is set and $arr contains an invalid code point
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # Outside the Unicode range. Negative values are rejected here
                # too, otherwise they would reach the 2 byte branch below and
                # be encoded as garbage.
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)
                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                # Byte order mark (skip)

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Surrogates are not legal code points
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10FFFF)
                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is decoded with utf8_to_unicode() and every code point is packed
     * as a big-endian 16 bit unit; code points above 0xFFFF are written as a
     * surrogate pair instead of being truncated to their low 16 bits.
     *
     * @param string  $str UTF-8 string (taken by reference, not modified)
     * @param boolean $bom prepend the UTF-16BE byte order mark (FE FF)?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // utf8_to_unicode() can return false instead of an array
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // does not fit into one 16 bit unit: high + low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}

if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is split into big-endian 16 bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF)
     * is combined into the code point it encodes; every other unit is used
     * as a code point as is. The result is encoded by unicode_to_utf8()
     * without its strict flag.
     *
     * @param string $str UTF-16BE string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $uni  = array();
        $high = 0; // pending high surrogate, 0 if none

        foreach($units as $unit){
            if($high){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    // valid pair: rebuild the astral code point
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = 0;
                    continue;
                }
                // unpaired high surrogate: hand it on unchanged
                $uni[] = $high;
                $high  = 0;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        if($high) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * The string is walked front to back. At each position the pattern
     * matches either one complete, well-formed UTF-8 sequence, which is
     * copied to the result, or a single stray byte, which is replaced by
     * $replace.
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str     string to search
     * @param string $replace string to replace bad bytes with (defaults to '',
     *                        i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|([\x80-\xFF]))';                       # invalid byte (never ASCII, see first alternative)

        // A: the match has to start exactly at the given offset
        $pattern = '/'.$UTF8_BAD.'/AS';
        $len     = strlen($str);
        $pos     = 0;
        $result  = '';

        // Advance an offset instead of cutting the matched prefix off the
        // string, which would copy the remainder on every iteration.
        while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
            // group 2 is only present when the invalid byte alternative matched
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $pos    += strlen($matches[0]);
        }

        return $result;
    }
}
character) 9355953e889Schris * true = down (next character) 9365953e889Schris * 9375953e889Schris * @return int byte index into $str now pointing to a utf8 character boundary 9385953e889Schris * 9395953e889Schris * @author chris smith <chris@jalakai.co.uk> 9405953e889Schris */ 9415953e889Schris function utf8_correctIdx(&$str,$i,$next=false) { 9425953e889Schris 943f50163d1Schris if ($i <= 0) return 0; 944f50163d1Schris 9455953e889Schris $limit = strlen($str); 946f50163d1Schris if ($i>=$limit) return $limit; 947f50163d1Schris 948f50163d1Schris if ($next) { 9495953e889Schris while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++; 9505953e889Schris } else { 9515953e889Schris while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--; 9525953e889Schris } 9535953e889Schris 9545953e889Schris return $i; 9555953e889Schris } 956df957b36SAndreas Gohr} 9575953e889Schris 958ab77016bSAndreas Gohr// only needed if no mb_string available 959ab77016bSAndreas Gohrif(!UTF8_MBSTRING){ 96015fa0b4fSAndreas Gohr /** 96182257610Sandi * UTF-8 Case lookup table 96282257610Sandi * 96382257610Sandi * This lookuptable defines the upper case letters to their correspponding 96482257610Sandi * lower case letter in UTF-8 96582257610Sandi * 96682257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 96782257610Sandi */ 96854662a04SAndreas Gohr global $UTF8_LOWER_TO_UPPER; 969df957b36SAndreas Gohr if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array( 97072de9068SAndreas Gohr "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q", 97172de9068SAndreas Gohr "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G", 97272de9068SAndreas Gohr "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ", 97372de9068SAndreas Gohr "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ", 97472de9068SAndreas Gohr 
"ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ", 97572de9068SAndreas Gohr "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ", 97672de9068SAndreas Gohr "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ", 97772de9068SAndreas Gohr "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ", 97872de9068SAndreas Gohr "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ", 97972de9068SAndreas Gohr "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ", 98072de9068SAndreas Gohr "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ", 98172de9068SAndreas Gohr "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ", 98272de9068SAndreas Gohr "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ", 98372de9068SAndreas Gohr "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố", 98472de9068SAndreas Gohr "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ", 98572de9068SAndreas Gohr "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ", 98672de9068SAndreas Gohr "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ", 98772de9068SAndreas Gohr "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ", 98872de9068SAndreas Gohr "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ", 98972de9068SAndreas Gohr "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ", 99072de9068SAndreas Gohr "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ", 99172de9068SAndreas Gohr 
"ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ", 99272de9068SAndreas Gohr "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ", 99372de9068SAndreas Gohr "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ", 99472de9068SAndreas Gohr "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս", 99572de9068SAndreas Gohr "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ", 99672de9068SAndreas Gohr "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ", 99772de9068SAndreas Gohr "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ", 99872de9068SAndreas Gohr "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ", 99972de9068SAndreas Gohr "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ", 100072de9068SAndreas Gohr "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ", 100172de9068SAndreas Gohr "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ", 100272de9068SAndreas Gohr "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ", 100372de9068SAndreas Gohr "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ", 100472de9068SAndreas Gohr "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ", 100572de9068SAndreas Gohr "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ", 100672de9068SAndreas Gohr "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ", 100772de9068SAndreas Gohr "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ", 100872de9068SAndreas Gohr 
"ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П", 100972de9068SAndreas Gohr "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е", 101072de9068SAndreas Gohr "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ", 101172de9068SAndreas Gohr "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ", 101272de9068SAndreas Gohr "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ", 101372de9068SAndreas Gohr "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π", 101472de9068SAndreas Gohr "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ", 101572de9068SAndreas Gohr "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ", 101672de9068SAndreas Gohr "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ", 101772de9068SAndreas Gohr "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ", 101872de9068SAndreas Gohr "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș", 101972de9068SAndreas Gohr "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ", 102072de9068SAndreas Gohr "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ", 102172de9068SAndreas Gohr "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ", 102272de9068SAndreas Gohr "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ", 102372de9068SAndreas Gohr "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ", 102472de9068SAndreas Gohr "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž", 102572de9068SAndreas Gohr 
"ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ", 102672de9068SAndreas Gohr "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ", 102772de9068SAndreas Gohr "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ", 102872de9068SAndreas Gohr "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī", 102972de9068SAndreas Gohr "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė", 103072de9068SAndreas Gohr "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă", 103172de9068SAndreas Gohr "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö", 103272de9068SAndreas Gohr "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì", 103372de9068SAndreas Gohr "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â", 103472de9068SAndreas Gohr "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T", 103572de9068SAndreas Gohr "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J", 103672de9068SAndreas Gohr "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A" 103782257610Sandi ); 103882257610Sandi 103982257610Sandi /** 104082257610Sandi * UTF-8 Case lookup table 104182257610Sandi * 104282257610Sandi * This lookuptable defines the lower case letters to their correspponding 104372de9068SAndreas Gohr * upper case letter in UTF-8 104482257610Sandi * 104582257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 104682257610Sandi */ 104754662a04SAndreas Gohr global $UTF8_UPPER_TO_LOWER; 1048df957b36SAndreas Gohr if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array ( 104972de9068SAndreas Gohr 
"Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q", 105072de9068SAndreas Gohr "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g", 105172de9068SAndreas Gohr "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ", 105272de9068SAndreas Gohr "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ", 105372de9068SAndreas Gohr "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ", 105472de9068SAndreas Gohr "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ", 105572de9068SAndreas Gohr "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ", 105672de9068SAndreas Gohr "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ", 105772de9068SAndreas Gohr "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ", 105872de9068SAndreas Gohr "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ", 105972de9068SAndreas Gohr "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ", 106072de9068SAndreas Gohr "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ", 106172de9068SAndreas Gohr "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ", 106272de9068SAndreas Gohr "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố", 106372de9068SAndreas Gohr "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ", 106472de9068SAndreas Gohr "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ", 106572de9068SAndreas Gohr "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ", 106672de9068SAndreas Gohr 
"Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ", 106772de9068SAndreas Gohr "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ", 106872de9068SAndreas Gohr "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ", 106972de9068SAndreas Gohr "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ", 107072de9068SAndreas Gohr "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ", 107172de9068SAndreas Gohr "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ", 107272de9068SAndreas Gohr "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ", 107372de9068SAndreas Gohr "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս", 107472de9068SAndreas Gohr "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ", 107572de9068SAndreas Gohr "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ", 107672de9068SAndreas Gohr "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ", 107772de9068SAndreas Gohr "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ", 107872de9068SAndreas Gohr "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ", 107972de9068SAndreas Gohr "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ", 108072de9068SAndreas Gohr "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ", 108172de9068SAndreas Gohr "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ", 108272de9068SAndreas Gohr "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ", 108372de9068SAndreas Gohr 
"Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ", 108472de9068SAndreas Gohr "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ", 108572de9068SAndreas Gohr "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ", 108672de9068SAndreas Gohr "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ", 108772de9068SAndreas Gohr "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п", 108872de9068SAndreas Gohr "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е", 108972de9068SAndreas Gohr "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ", 109072de9068SAndreas Gohr "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ", 109172de9068SAndreas Gohr "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ", 109272de9068SAndreas Gohr "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π", 109372de9068SAndreas Gohr "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ", 109472de9068SAndreas Gohr "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ", 109572de9068SAndreas Gohr "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ", 109672de9068SAndreas Gohr "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ", 109772de9068SAndreas Gohr "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș", 109872de9068SAndreas Gohr "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ", 109972de9068SAndreas Gohr "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ", 110072de9068SAndreas Gohr 
"Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ", 110172de9068SAndreas Gohr "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ", 110272de9068SAndreas Gohr "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ", 110372de9068SAndreas Gohr "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž", 110472de9068SAndreas Gohr "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ", 110572de9068SAndreas Gohr "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ", 110672de9068SAndreas Gohr "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ", 110772de9068SAndreas Gohr "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī", 110872de9068SAndreas Gohr "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė", 110972de9068SAndreas Gohr "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă", 111072de9068SAndreas Gohr "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö", 111172de9068SAndreas Gohr "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì", 111272de9068SAndreas Gohr "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â", 111372de9068SAndreas Gohr "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t", 111472de9068SAndreas Gohr "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j", 111572de9068SAndreas Gohr "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a" 111672de9068SAndreas Gohr ); 111772de9068SAndreas Gohr}; // end of case lookup tables 1118ab77016bSAndreas Gohr 111982257610Sandi/** 112082257610Sandi * 
UTF-8 lookup table for lower case accented letters 112182257610Sandi * 112282257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 112382257610Sandi * range. This are lower case letters only. 112482257610Sandi * 112582257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 112682257610Sandi * @see utf8_deaccent() 112782257610Sandi */ 112854662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS; 1129df957b36SAndreas Gohrif(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array( 113082257610Sandi 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o', 113182257610Sandi 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k', 113282257610Sandi 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o', 113382257610Sandi 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o', 113482257610Sandi 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c', 113582257610Sandi 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't', 113682257610Sandi 'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l', 113782257610Sandi 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z', 113882257610Sandi 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't', 113982257610Sandi 'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', 114082257610Sandi 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j', 114182257610Sandi 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o', 114282257610Sandi 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g', 114382257610Sandi 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a', 114474c0c504Schris 'û' => 'u', 'þ' => 'th', 'ð' => 
'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e', 114582257610Sandi); 114682257610Sandi 114782257610Sandi/** 114882257610Sandi * UTF-8 lookup table for upper case accented letters 114982257610Sandi * 115082257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 115182257610Sandi * range. This are upper case letters only. 115282257610Sandi * 115382257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 115482257610Sandi * @see utf8_deaccent() 115582257610Sandi */ 115654662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS; 1157df957b36SAndreas Gohrif(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array( 1158df3ecd55SAndreas Gohr 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O', 1159df3ecd55SAndreas Gohr 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 1160df3ecd55SAndreas Gohr 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O', 1161df3ecd55SAndreas Gohr 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O', 1162df3ecd55SAndreas Gohr 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C', 1163df3ecd55SAndreas Gohr 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T', 1164df3ecd55SAndreas Gohr 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L', 1165df3ecd55SAndreas Gohr 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z', 1166df3ecd55SAndreas Gohr 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T', 1167df3ecd55SAndreas Gohr 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', 1168df3ecd55SAndreas Gohr 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J', 1169df3ecd55SAndreas Gohr 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O', 1170df3ecd55SAndreas Gohr 'Â' => 'A', 'Ľ' => 'L', 
'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G', 1171df3ecd55SAndreas Gohr 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A', 117274c0c504Schris 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E', 117382257610Sandi); 117482257610Sandi 1175099ada41Sandi/** 1176099ada41Sandi * UTF-8 array of common special characters 1177099ada41Sandi * 1178099ada41Sandi * This array should contain all special characters (not a letter or digit) 1179099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum 1180099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special 1181099ada41Sandi * chars. 1182099ada41Sandi * 1183099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! 1184ad81d431SAndreas Gohr * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a 1185099ada41Sandi * 1186099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org> 1187099ada41Sandi * @see utf8_stripspecials() 1188099ada41Sandi */ 118954662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS; 1190df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array( 1191099ada41Sandi 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 1192ad81d431SAndreas Gohr 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 11935c812709Sandi 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 11945c812709Sandi 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 1195099ada41Sandi 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 1196099ada41Sandi 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 1197099ada41Sandi 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 1198099ada41Sandi 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 1199099ada41Sandi 
0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 1200099ada41Sandi 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 1201099ada41Sandi 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 1202099ada41Sandi 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 1203fae4b5fcSAndreas Gohr 0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 1204099ada41Sandi 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 1205099ada41Sandi 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 1206099ada41Sandi 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 1207099ada41Sandi 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 1208099ada41Sandi 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 1209099ada41Sandi 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 1210099ada41Sandi 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 1211099ada41Sandi 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 1212099ada41Sandi 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 1213099ada41Sandi 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 1214099ada41Sandi 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 1215099ada41Sandi 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 1216099ada41Sandi 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 1217099ada41Sandi 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 1218099ada41Sandi 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 1219099ada41Sandi 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 
0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 1220099ada41Sandi 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 1221099ada41Sandi 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 1222099ada41Sandi 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 1223099ada41Sandi 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 1224099ada41Sandi 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 1225099ada41Sandi 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 1226099ada41Sandi 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 1227099ada41Sandi 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 1228099ada41Sandi 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 1229099ada41Sandi 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 1230099ada41Sandi 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 1231099ada41Sandi 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 1232099ada41Sandi 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 1233099ada41Sandi 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 1234099ada41Sandi 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 1235099ada41Sandi 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 1236d5b23302STom N Harris 0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 1237d5b23302STom N Harris 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, 1238d5b23302STom N Harris 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 1239d5b23302STom N Harris 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 
1240099ada41Sandi 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 1241099ada41Sandi 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 1242099ada41Sandi 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 1243099ada41Sandi 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 1244d5b23302STom N Harris 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 1245d5b23302STom N Harris 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 1246d5b23302STom N Harris 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 1247d5b23302STom N Harris 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 1248d5b23302STom N Harris 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 1249d5b23302STom N Harris 0xffeb, 0xffec, 0xffed, 0xffee, 1250fae4b5fcSAndreas Gohr 0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703, 1251fae4b5fcSAndreas Gohr 0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b, 1252fae4b5fcSAndreas Gohr 0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713, 12537de9cff5SAndreas Gohr 0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b, 12547de9cff5SAndreas Gohr 0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf, 1255099ada41Sandi); 1256340756e4Sandi 1257720307d9Schris// utf8 version of above data 1258720307d9Schrisglobal $UTF8_SPECIAL_CHARS2; 1259df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 = 126037242afaSTom N Harris "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~ �'. 1261720307d9Schris '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'. 1262fae4b5fcSAndreas Gohr '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'. 1263720307d9Schris '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿–—―‗‘’‚“”�'. 1264720307d9Schris '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'. 1265720307d9Schris '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'. 
1266720307d9Schris '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'. 1267720307d9Schris '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'. 1268720307d9Schris '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'. 1269720307d9Schris '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'. 1270720307d9Schris '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'. 1271720307d9Schris '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. 1272720307d9Schris '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. 1273720307d9Schris '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. 1274d5b23302STom N Harris '➷➸➹➺➻➼➽➾'. 1275d5b23302STom N Harris ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. 1276d5b23302STom N Harris '�'. 1277d5b23302STom N Harris '�ﹼﹽ'. 1278d5b23302STom N Harris '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. 1279fae4b5fcSAndreas Gohr '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'. 12807de9cff5SAndreas Gohr ''. 12817de9cff5SAndreas Gohr ' '; 1282720307d9Schris 12838a831f2bSAndreas Gohr/** 12848a831f2bSAndreas Gohr * Romanization lookup table 12858a831f2bSAndreas Gohr * 12868a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language 12878a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII. 12888a831f2bSAndreas Gohr * 12898a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works 12908a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement 12918a831f2bSAndreas Gohr * only. Specialities of each language are not supported. 
12928a831f2bSAndreas Gohr * 12938a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org> 12948a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com> 12958a831f2bSAndreas Gohr * @link http://www.uconv.com/translit.htm 12968a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi> 12978a831f2bSAndreas Gohr * @link http://kanjidict.stc.cx/hiragana.php?src=2 12988a831f2bSAndreas Gohr * @link http://www.translatum.gr/converter/greek-transliteration.htm 12998a831f2bSAndreas Gohr * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 13008a831f2bSAndreas Gohr * @link http://www.btranslations.com/resources/romanization/korean.asp 1301014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com> 1302fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de> 13038a831f2bSAndreas Gohr */ 130454662a04SAndreas Gohrglobal $UTF8_ROMANIZATION; 1305df957b36SAndreas Gohrif(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array( 1306176ae32bSAndreas Gohr // scandinavian - differs from what we do in deaccent 1307176ae32bSAndreas Gohr 'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O', 1308176ae32bSAndreas Gohr 13098a831f2bSAndreas Gohr //russian cyrillic 13108a831f2bSAndreas Gohr 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 13118a831f2bSAndreas Gohr 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 13128a831f2bSAndreas Gohr 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 13138a831f2bSAndreas Gohr 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 13148a831f2bSAndreas Gohr 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 13158a831f2bSAndreas Gohr 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1316d8cb2602SDenis Simakov 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1317f5e334deSAndreas Gohr 
'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 13188a831f2bSAndreas Gohr 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 13198a831f2bSAndreas Gohr // Ukrainian cyrillic 13208a831f2bSAndreas Gohr 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 13218a831f2bSAndreas Gohr // Georgian 13228a831f2bSAndreas Gohr 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 13238a831f2bSAndreas Gohr 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 13248a831f2bSAndreas Gohr 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 13258a831f2bSAndreas Gohr 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 13268a831f2bSAndreas Gohr 'ჰ'=>'xh', 13278a831f2bSAndreas Gohr //Sanskrit 13288a831f2bSAndreas Gohr 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 13298a831f2bSAndreas Gohr 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 13308a831f2bSAndreas Gohr 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 13318a831f2bSAndreas Gohr 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 13328a831f2bSAndreas Gohr 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 13338a831f2bSAndreas Gohr 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 13348a831f2bSAndreas Gohr 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 13358a831f2bSAndreas Gohr //Hebrew 13363dbad6dcSDenis Simakov 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 13373dbad6dcSDenis Simakov 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 13383dbad6dcSDenis Simakov 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 13398a831f2bSAndreas Gohr 'ש'=>'sh','ת'=>'t', 13408a831f2bSAndreas Gohr //Arabic 13418a831f2bSAndreas Gohr 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 
13428a831f2bSAndreas Gohr 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 13438a831f2bSAndreas Gohr 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 13448a831f2bSAndreas Gohr 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 13458a831f2bSAndreas Gohr 1346799e0977SAndreas Gohr // Japanese characters (last update: 2008-05-09) 13479476a253SAndreas Gohr 13488a831f2bSAndreas Gohr // Japanese hiragana 1349fed467f8SDenis Scheither 1350fed467f8SDenis Scheither // 3 character syllables, っ doubles the consonant after 1351fed467f8SDenis Scheither 'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu', 1352879205e1SAndreas Gohr 'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu', 1353799e0977SAndreas Gohr 'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu', 1354879205e1SAndreas Gohr 'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu', 1355879205e1SAndreas Gohr // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu', 1356879205e1SAndreas Gohr 'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu', 1357879205e1SAndreas Gohr 'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu', 1358879205e1SAndreas Gohr 'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu', 1359879205e1SAndreas Gohr 'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu', 1360879205e1SAndreas Gohr 'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu', 1361879205e1SAndreas Gohr 'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu', 1362879205e1SAndreas Gohr 1363879205e1SAndreas Gohr // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway) 1364879205e1SAndreas Gohr 'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u', 1365879205e1SAndreas Gohr 'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu', 1366fed467f8SDenis Scheither 
1367fed467f8SDenis Scheither // 2 character syllables - normal 1368879205e1SAndreas Gohr 'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1369fed467f8SDenis Scheither 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu', 1370fed467f8SDenis Scheither 'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu', 1371799e0977SAndreas Gohr 'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1372799e0977SAndreas Gohr 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu', 1373fed467f8SDenis Scheither 'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu', 1374fed467f8SDenis Scheither 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu', 1375fed467f8SDenis Scheither 'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1376fed467f8SDenis Scheither 'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1377fed467f8SDenis Scheither 'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu', 1378fed467f8SDenis Scheither 'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu', 1379879205e1SAndreas Gohr 'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju', 1380879205e1SAndreas Gohr 'うぇ'=>'we','うぃ'=>'wi', 1381879205e1SAndreas Gohr 'いぇ'=>'ye', 1382fed467f8SDenis Scheither 1383fed467f8SDenis Scheither // 2 character syllables, っ doubles the consonant after 1384fed467f8SDenis Scheither 'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu', 1385fed467f8SDenis Scheither 'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu', 1386fed467f8SDenis Scheither 'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu', 1387fed467f8SDenis Scheither 'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu', 1388fed467f8SDenis Scheither 'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu', 1389fed467f8SDenis Scheither 'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku', 1390fed467f8SDenis Scheither 'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu', 1391fed467f8SDenis Scheither 
'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu', 1392fed467f8SDenis Scheither 'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru', 1393fed467f8SDenis Scheither 'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu', 1394799e0977SAndreas Gohr 'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu', 1395fed467f8SDenis Scheither 1396fed467f8SDenis Scheither // 1 character syllabels 1397fed467f8SDenis Scheither 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n', 1398879205e1SAndreas Gohr 'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu', 1399fed467f8SDenis Scheither 'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu', 1400fed467f8SDenis Scheither 'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu', 14019476a253SAndreas Gohr 'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu', 1402fed467f8SDenis Scheither 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du', 1403fed467f8SDenis Scheither 'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu', 1404fed467f8SDenis Scheither 'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku', 1405fed467f8SDenis Scheither 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu', 1406fed467f8SDenis Scheither 'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu', 1407fed467f8SDenis Scheither 'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru', 1408fed467f8SDenis Scheither 'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su', 1409879205e1SAndreas Gohr 'わ'=>'wa','を'=>'wo', 1410879205e1SAndreas Gohr 'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu', 1411879205e1SAndreas Gohr 'や'=>'ya','よ'=>'yo','ゆ'=>'yu', 14129476a253SAndreas Gohr // old characters 14139476a253SAndreas Gohr 'ゑ'=>'we','ゐ'=>'wi', 1414fed467f8SDenis Scheither 14159476a253SAndreas Gohr // convert what's left (probably only kicks in when something's missing above) 14169476a253SAndreas Gohr // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u', 14179476a253SAndreas Gohr // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu', 1418fed467f8SDenis Scheither 14199476a253SAndreas Gohr // never seen one of those 
(disabled for the moment) 1420879205e1SAndreas Gohr // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu', 14219476a253SAndreas Gohr // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu', 14229476a253SAndreas Gohr // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu', 14239476a253SAndreas Gohr // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu', 14249476a253SAndreas Gohr // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu', 14259476a253SAndreas Gohr // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 14269476a253SAndreas Gohr // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu', 14279476a253SAndreas Gohr // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 14289476a253SAndreas Gohr // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu', 14299476a253SAndreas Gohr // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu', 14309476a253SAndreas Gohr // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu', 14319476a253SAndreas Gohr // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu', 14329476a253SAndreas Gohr // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu', 14339476a253SAndreas Gohr // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu', 1434fed467f8SDenis Scheither 1435fed467f8SDenis Scheither // 'spare' characters from other romanization systems 1436fed467f8SDenis Scheither // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du', 1437fed467f8SDenis Scheither // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1438fed467f8SDenis Scheither // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su', 1439fed467f8SDenis Scheither // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu', 1440fed467f8SDenis Scheither //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu', 1441fed467f8SDenis Scheither //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu', 1442fed467f8SDenis Scheither 
//'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu', 1443fed467f8SDenis Scheither //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu', 1444fed467f8SDenis Scheither //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi', 1445fed467f8SDenis Scheither //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju', 1446fed467f8SDenis Scheither 1447fed467f8SDenis Scheither 14488a831f2bSAndreas Gohr // Japanese katakana 1449fed467f8SDenis Scheither 1450fed467f8SDenis Scheither // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs) 1451fed467f8SDenis Scheither 'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu', 1452fed467f8SDenis Scheither 'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu', 1453fed467f8SDenis Scheither 'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu', 1454fed467f8SDenis Scheither 'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu', 1455fed467f8SDenis Scheither 'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu', 1456fed467f8SDenis Scheither 'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu', 1457fed467f8SDenis Scheither 'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu', 1458fed467f8SDenis Scheither 'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu', 1459fed467f8SDenis Scheither 'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu', 1460799e0977SAndreas Gohr 'ッティー'=>'ttii', 1461799e0977SAndreas Gohr 'ッヂィー'=>'ddii', 1462fed467f8SDenis Scheither 1463fed467f8SDenis Scheither // 3 character syllables - doubled vowels 1464fed467f8SDenis Scheither 'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo', 1465fed467f8SDenis Scheither 'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu', 1466fed467f8SDenis 
Scheither 'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu', 1467fed467f8SDenis Scheither 'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu', 1468fed467f8SDenis Scheither 'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu', 1469fed467f8SDenis Scheither 'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu', 1470fed467f8SDenis Scheither 'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu', 1471fed467f8SDenis Scheither 'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu', 1472fed467f8SDenis Scheither 'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu', 1473fed467f8SDenis Scheither 'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu', 1474fed467f8SDenis Scheither 'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu', 1475fed467f8SDenis Scheither 'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu', 1476fed467f8SDenis Scheither 'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu', 1477fed467f8SDenis Scheither 'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu', 1478fed467f8SDenis Scheither 'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu', 1479fed467f8SDenis Scheither 'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu', 1480fed467f8SDenis Scheither 'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu', 1481fed467f8SDenis Scheither 'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu', 1482fed467f8SDenis Scheither 'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu', 1483fed467f8SDenis Scheither 'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu', 1484fed467f8SDenis Scheither 'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu', 1485fed467f8SDenis Scheither 'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu', 1486fed467f8SDenis Scheither 
'ウェー'=>'wee','ウィー'=>'wii', 1487fed467f8SDenis Scheither 'イェー'=>'yee', 1488799e0977SAndreas Gohr 'ティー'=>'tii', 1489799e0977SAndreas Gohr 'ヂィー'=>'dii', 1490fed467f8SDenis Scheither 1491fed467f8SDenis Scheither // 3 character syllables - doubled consonants 1492fed467f8SDenis Scheither 'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu', 1493fed467f8SDenis Scheither 'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu', 1494fed467f8SDenis Scheither 'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu', 1495fed467f8SDenis Scheither 'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu', 1496fed467f8SDenis Scheither 'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu', 1497fed467f8SDenis Scheither 'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu', 1498fed467f8SDenis Scheither 'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu', 1499fed467f8SDenis Scheither 'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu', 1500fed467f8SDenis Scheither 'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu', 1501799e0977SAndreas Gohr 'ッティ'=>'tti', 1502799e0977SAndreas Gohr 'ッヂィ'=>'ddi', 1503fed467f8SDenis Scheither 1504fed467f8SDenis Scheither // 3 character syllables - doubled vowel and consonants 1505fed467f8SDenis Scheither 'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu', 1506fed467f8SDenis Scheither 'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu', 1507fed467f8SDenis Scheither 'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa', 1508fed467f8SDenis Scheither 'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu', 1509fed467f8SDenis Scheither 'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu', 1510fed467f8SDenis Scheither 'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu', 1511fed467f8SDenis Scheither 
'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu', 1512fed467f8SDenis Scheither 'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu', 1513799e0977SAndreas Gohr 'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu', 1514799e0977SAndreas Gohr 'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu', 1515fed467f8SDenis Scheither 'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu', 1516fed467f8SDenis Scheither 1517fed467f8SDenis Scheither // 2 character syllables - normal 1518799e0977SAndreas Gohr 'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu', 1519799e0977SAndreas Gohr // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu', 1520799e0977SAndreas Gohr 'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu', 1521fed467f8SDenis Scheither 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu', 1522fed467f8SDenis Scheither 'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu', 1523fed467f8SDenis Scheither 'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1524fed467f8SDenis Scheither 'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu', 1525fed467f8SDenis Scheither 'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1526fed467f8SDenis Scheither 'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu', 1527fed467f8SDenis Scheither 'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu', 1528fed467f8SDenis Scheither 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu', 1529879205e1SAndreas Gohr 'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu', 1530879205e1SAndreas Gohr 'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju', 1531fed467f8SDenis Scheither 'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu', 1532fed467f8SDenis Scheither 'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du', 1533fed467f8SDenis Scheither 'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1534799e0977SAndreas Gohr // 
'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu', 1535fed467f8SDenis Scheither 'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1536fed467f8SDenis Scheither 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu', 1537fed467f8SDenis Scheither 'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu', 1538fed467f8SDenis Scheither 'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu', 1539fed467f8SDenis Scheither 'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu', 1540fed467f8SDenis Scheither 'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu', 1541fed467f8SDenis Scheither 'ウェ'=>'we','ウィ'=>'wi', 1542fed467f8SDenis Scheither 'イェ'=>'ye', 1543799e0977SAndreas Gohr 'ティ'=>'ti', 1544799e0977SAndreas Gohr 'ヂィ'=>'di', 1545fed467f8SDenis Scheither 1546fed467f8SDenis Scheither // 2 character syllables - doubled vocal 1547fed467f8SDenis Scheither 'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu', 1548fed467f8SDenis Scheither 'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu', 1549fed467f8SDenis Scheither 'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu', 1550fed467f8SDenis Scheither 'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu', 1551fed467f8SDenis Scheither 'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu', 1552fed467f8SDenis Scheither 'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa', 1553fed467f8SDenis Scheither 'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu', 1554fed467f8SDenis Scheither 'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu', 1555fed467f8SDenis Scheither 'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu', 1556fed467f8SDenis Scheither 'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu', 1557fed467f8SDenis Scheither 'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu', 1558799e0977SAndreas Gohr 'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu', 1559fed467f8SDenis Scheither 
'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu', 1560fed467f8SDenis Scheither 'ワー'=>'waa','ヲー'=>'woo', 1561fed467f8SDenis Scheither 'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu', 1562fed467f8SDenis Scheither 'ヵー'=>'kaa','ヶー'=>'kee', 15639476a253SAndreas Gohr // old characters 15649476a253SAndreas Gohr 'ヱー'=>'wee','ヰー'=>'wii', 1565fed467f8SDenis Scheither 1566879205e1SAndreas Gohr // seperate katakana 'n' 1567879205e1SAndreas Gohr 'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u', 1568879205e1SAndreas Gohr 'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu', 1569879205e1SAndreas Gohr 1570fed467f8SDenis Scheither // 2 character syllables - doubled consonants 1571fed467f8SDenis Scheither 'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu', 1572fed467f8SDenis Scheither 'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu', 1573fed467f8SDenis Scheither 'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka', 1574fed467f8SDenis Scheither 'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu', 1575fed467f8SDenis Scheither 'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu', 1576fed467f8SDenis Scheither 'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu', 1577fed467f8SDenis Scheither 'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru', 1578fed467f8SDenis Scheither 'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu', 1579799e0977SAndreas Gohr 'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu', 1580799e0977SAndreas Gohr 'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu', 1581fed467f8SDenis Scheither 'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu', 1582fed467f8SDenis Scheither 1583fed467f8SDenis Scheither // 1 character syllables 1584fed467f8SDenis Scheither 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n', 1585fed467f8SDenis Scheither 'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu', 1586fed467f8SDenis Scheither 'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu', 1587fed467f8SDenis 
Scheither 'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1588fed467f8SDenis Scheither 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka', 1589fed467f8SDenis Scheither 'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu', 1590fed467f8SDenis Scheither 'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu', 1591fed467f8SDenis Scheither 'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu', 1592fed467f8SDenis Scheither 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru', 1593fed467f8SDenis Scheither 'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su', 1594879205e1SAndreas Gohr 'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu', 1595fed467f8SDenis Scheither 'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu', 1596fed467f8SDenis Scheither 'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du', 1597fed467f8SDenis Scheither 'ワ'=>'wa','ヲ'=>'wo', 1598fed467f8SDenis Scheither 'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu', 1599fed467f8SDenis Scheither 'ヵ'=>'ka','ヶ'=>'ke', 16009476a253SAndreas Gohr // old characters 16019476a253SAndreas Gohr 'ヱ'=>'we','ヰ'=>'wi', 1602fed467f8SDenis Scheither 16039476a253SAndreas Gohr // convert what's left (probably only kicks in when something's missing above) 1604fed467f8SDenis Scheither 'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u', 1605fed467f8SDenis Scheither 'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu', 1606fed467f8SDenis Scheither 1607799e0977SAndreas Gohr // special characters 1608799e0977SAndreas Gohr '・'=>'_','、'=>'_', 1609799e0977SAndreas Gohr 'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise 1610799e0977SAndreas Gohr 1611fed467f8SDenis Scheither // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu', 1612fed467f8SDenis Scheither // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu', 1613fed467f8SDenis Scheither //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu', 1614fed467f8SDenis Scheither // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu', 1615fed467f8SDenis Scheither // 
'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu', 1616fed467f8SDenis Scheither //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu', 1617fed467f8SDenis Scheither //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu', 1618fed467f8SDenis Scheither // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu', 1619fed467f8SDenis Scheither // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu', 1620fed467f8SDenis Scheither //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu', 1621fed467f8SDenis Scheither //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu', 1622fed467f8SDenis Scheither //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi', 16238a831f2bSAndreas Gohr 16248a831f2bSAndreas Gohr // "Greeklish" 16258a831f2bSAndreas Gohr 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 16268a831f2bSAndreas Gohr 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 16278a831f2bSAndreas Gohr 16288a831f2bSAndreas Gohr // Thai 16298a831f2bSAndreas Gohr 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 16308a831f2bSAndreas Gohr 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 16318a831f2bSAndreas Gohr 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 16328a831f2bSAndreas Gohr 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 16338a831f2bSAndreas Gohr 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 16348a831f2bSAndreas Gohr 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1635014d0ab6SAndreas Gohr 'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am', 1636014d0ab6SAndreas Gohr 'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u', 1637014d0ab6SAndreas Gohr 'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o', 1638014d0ab6SAndreas Gohr 
'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua', 1639014d0ab6SAndreas Gohr 'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao', 1640014d0ab6SAndreas Gohr 'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai', 1641014d0ab6SAndreas Gohr 'ิว'=>'io','็ว'=>'eo','ียว'=>'iao', 1642014d0ab6SAndreas Gohr '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'', 1643014d0ab6SAndreas Gohr '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'', 1644014d0ab6SAndreas Gohr 'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-', 1645014d0ab6SAndreas Gohr '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4', 1646014d0ab6SAndreas Gohr '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9', 16478a831f2bSAndreas Gohr 16488a831f2bSAndreas Gohr // Korean 16498a831f2bSAndreas Gohr 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 16508a831f2bSAndreas Gohr 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 16518a831f2bSAndreas Gohr 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 16528a831f2bSAndreas Gohr 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 16538a831f2bSAndreas Gohr 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 16548a831f2bSAndreas Gohr 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 16558a831f2bSAndreas Gohr); 1656340756e4Sandi 16578a831f2bSAndreas Gohr 1658