<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * check for mb_string support
 *
 * UTF8_MBSTRING is 1 when the mbstring extension is available and has not
 * been disabled by defining UTF8_NOMBSTRING, 0 otherwise.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }

if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }
}

if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param string $str
     * @return string the input with every byte >= 0x80 removed
     */
    function utf8_strip($str){
        $ascii = '';
        $len = strlen($str);
        for($i=0; $i<$len; $i++){
            // square brackets: the {} offset syntax is a parse error since PHP 8
            if(ord($str[$i]) <128){
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}

if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the structure is checked: every lead byte must be followed by the
     * announced number of continuation bytes (10bbbbbb). Lead bytes for 5 and
     * 6 byte sequences are accepted, and the decoded values are not inspected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param string $Str
     * @return bool
     */
    function utf8_check($Str) {
        $len = strlen($Str);
        for ($i=0; $i<$len; $i++) {
            $b = ord($Str[$i]);
            if ($b < 0x80) continue; # 0bbbbbbb
            elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
            elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
            elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
            elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
            elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
            else return false; # Does not match any model

            for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
                if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                    return false;
            }
        }
        return true;
    }
}

if(!function_exists('utf8_basename')){
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        $path = trim($path,'\\/');

        // position of the last separator of either kind, -1 when there is none
        $slash = strrpos($path, '/');
        $backslash = strrpos($path, '\\');
        $rpos = max(
            ($slash === false) ? -1 : $slash,
            ($backslash === false) ? -1 : $backslash
        );
        if($rpos >= 0) $path = substr($path, $rpos+1);

        $suffix = (string) $suffix;
        $suflen = strlen($suffix);
        // strict comparison: with == numeric looking strings such as '1e1'
        // and '010' compare equal and the wrong tail would be cut off
        if($suflen && (substr($path, -$suflen) === $suffix)){
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }
}

if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright - It's
     * even faster than mb_strlen.
     *
     * utf8_decode() is deprecated as of PHP 8.2, so when it is not available
     * the count falls back to mb_strlen() or to counting all bytes that are
     * not UTF-8 continuation bytes.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param string $string
     * @return int number of characters
     */
    function utf8_strlen($string){
        if(function_exists('utf8_decode')){
            return strlen(utf8_decode($string));
        }
        if(UTF8_MBSTRING){
            return mb_strlen($string, 'UTF-8');
        }
        // every character has exactly one byte that is not 10bbbbbb
        return (int) preg_match_all('/[\x00-\x7F\xC0-\xFF]/', (string) $string, $dummy);
    }
}

if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return mixed string or false if failure
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            if( $length === null ){
                return mb_substr($str, $offset);
            }else{
                return mb_substr($str, $offset, $length);
            }
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = utf8_strlen($str);        // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset/65535);
            $Oy = $offset%65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if (is_null($length)) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {

            if (!isset($strlen)) $strlen = utf8_strlen($str);    // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {

                $length = min($strlen-$offset, $length);  // reduce any length that would go past the end of the string

                $Lx = (int)($length/65535);
                $Ly = $length%65535;

                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {

                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length)/65535);
                $Ly = (-$length)%65535;

                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
        return $match[1];
    }
}

if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is built from the first $start characters (only when $start
     * is positive), the replacement, and everything from character
     * $start+$length onwards. With the default $length of 0 the replacement
     * is inserted at $start.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param string $string
     * @param string $replacement
     * @param int    $start  character offset
     * @param int    $length number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $ret = '';
        if($start>0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start+$length);
        return $ret;
    }
}

if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str
     * @param  string $charlist characters to strip, each taken literally
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        //quote charlist for use in a characterclass
        //(^ is quoted as well, a leading one would negate the class)
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}

if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str
     * @param  string $charlist characters to strip, each taken literally
     * @return string
     */
    function utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        //quote charlist for use in a characterclass
        //(^ is quoted as well, a leading one would negate the class)
        $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u','',$str);
    }
}

if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_trim($str,$charlist='') {
        if($charlist == '') return trim($str);

        return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
    }
}

if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_UPPER_TO_LOWER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param string $string
     * @return string
     */
    function utf8_strtolower($string){
        if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

        global $UTF8_UPPER_TO_LOWER;
        return strtr($string,$UTF8_UPPER_TO_LOWER);
    }
}

if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

        global $UTF8_LOWER_TO_UPPER;
        return strtr($string,$UTF8_LOWER_TO_UPPER);
    }
}

if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        switch ( utf8_strlen($str) ) {
            case 0:
                return '';
            case 1:
                return utf8_strtoupper($str);
            default:
                // the u flag makes the match fail on invalid UTF-8; hand the
                // input back untouched instead of reading undefined matches
                if(!preg_match('/^(.{1})(.*)$/us', $str, $matches)) return $str;
                return utf8_strtoupper($matches[1]).$matches[2];
        }
    }
}

if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        // The whole match always starts with the captured whitespace, so cut
        // it off by length. ltrim() would leave a form feed (\x0c) in place,
        // which then got replaced instead of the first letter.
        $word = substr($matches[0], strlen($leadingws));
        $ucfirst = utf8_strtoupper($matches[3]);
        $ucword = utf8_substr_replace($word,$ucfirst,0,1);
        return $leadingws . $ucword;
    }
}

if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param string $string
     * @param int    $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        if($case <= 0){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }
        if($case >= 0){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }
        return $string;
    }
}

if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param string $string
     * @return string
     */
    function utf8_romanize($string){
        if(utf8_isASCII($string)) return $string; //nothing to do

        global $UTF8_ROMANIZATION;
        return strtr($string,$UTF8_ROMANIZATION);
    }
}

if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
     *
     * NOTE(review): the ASCII control range ends at 0x1F, so 0x1A-0x1F are
     * currently left in place - confirm whether 0x19 is intended.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;

        // the quoted character list is built once per request
        static $specials = null;
        if(is_null($specials)){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
    }
}

if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching at
     * @return integer|false character position of the match, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        $comp = 0;       // byte compensation added to the character offset
        $length = null;  // character position of the current match

        while (is_null($length) || $length < $offset) {
            $pos = strpos($haystack, $needle, $offset + $comp);

            if ($pos === false)
                return false;

            $length = utf8_strlen(substr($haystack, 0, $pos));

            // match lies before the requested character offset: search again
            // further right, shifted by the multibyte overhead seen so far
            if ($length < $offset)
                $comp = $pos - $length;
        }

        return $length;
    }
}

if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as they are, code points below 0x100
     * become decimal entities and everything else hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $ret = '';
        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp < 0x80)
                $ret .= chr($cp);
            elseif ($cp < 0x100)
                $ret .= "&#$cp;";
            else
                $ret .= '&#x'.dechex($cp).';';
        }
        return $ret;
    }
}

if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (decimal and hexadecimal) are
     * decoded. Pass a non-null $entities (e.g. HTML_ENTITIES) and named
     * entities are decoded in the same pass as well, so that no entity is
     * decoded twice.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        if (is_null($entities)) {
            return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                         'utf8_decode_numeric', $str);
        }

        // the decoder builds its lookup table once and is reused afterwards
        static $decoder = null;
        if (is_null($decoder))
            $decoder = new utf8_entity_decoder();

        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array($decoder, 'decode'), $str);
    }
}

if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the match array of one of the patterns used in utf8_unhtml():
     * index 0 is the full entity, index 2 the optional x/X marker and index 3
     * the digits. Entities whose digits are not valid for their base are
     * returned unchanged.
     *
     * @param $ent array match data of a numeric entity
     * @return string
     */
    function utf8_decode_numeric($ent) {
        switch ($ent[2]) {
            case 'X':
            case 'x':
                if (!preg_match('/^[0-9A-Fa-f]+\z/', $ent[3])) return $ent[0];
                $cp = hexdec($ent[3]);
                break;
            default:
                if (!preg_match('/^[0-9]+\z/', $ent[3])) return $ent[0];
                $cp = intval($ent[3]);
                break;
        }
        return unicode_to_utf8(array($cp));
    }
}

if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps entities ('&amp;') to their UTF-8 character */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * The table is requested in UTF-8 explicitly. With the default
         * encoding (UTF-8 since PHP 5.4) the former ord() based conversion
         * only looked at the first byte of each multi-byte character.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single byte character, taking its byte value as the
         * code point. Not used by the decoder itself anymore, kept for
         * callers that relied on it.
         *
         * @param $c string
         * @return mixed
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to its correct UTF-8 char equivalent
         *
         * Unknown named entities are returned unchanged.
         *
         * @param $ent array match data of an entity (see utf8_unhtml())
         * @return string
         */
        function decode($ent) {
            if ($ent[1] === '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}

if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the {} offset syntax is a parse error since PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}

if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING
     *
     * Note: this function has been modified slightly in this library to use
     * output buffering to concatenate the UTF-8 string (faster) as well as
     * reference the array by it's keys
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported, i.e. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM (0xFEFF) are ignored.
     * Surrogates (0xD800-0xDFFF) are not allowed.
     *
     * In non-strict mode surrogates, negative values and values above
     * 0x10FFFF are silently skipped. If $strict is set to true the function
     * raises a PHP error at level E_USER_WARNING and returns false as soon
     * as such a value is met.
     *
     * The result is built by plain string concatenation (no output
     * buffering), so an early return in strict mode cannot leave an output
     * buffer open.
     *
     * @param  array   $arr    of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed   UTF-8 string ('' if $arr is not an array) or false if
     *                 $strict is set and the array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';
        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                # outside the Unicode range (negative values used to fall
                # into the 2 byte branch and produce garbage bytes)
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                # ASCII range (including control chars)
                $utf8 .= chr($cp);

            } else if ($cp <= 0x07ff) {
                # 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6)).
                         chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                # Byte order mark (skip)

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                # Illegal surrogate: error in strict mode, skipped otherwise
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                # 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12)).
                         chr(0x80 | (($cp >> 6) & 0x003f)).
                         chr(0x80 | ($cp & 0x003f));

            } else {
                # 4 byte sequence (0x10000 - 0x10ffff)
                $utf8 .= chr(0xf0 | ($cp >> 18)).
                         chr(0x80 | (($cp >> 12) & 0x3f)).
                         chr(0x80 | (($cp >> 6) & 0x3f)).
                         chr(0x80 | ($cp & 0x3f));
            }
        }

        return $utf8;
    }
}

if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * With mbstring the work is delegated to mb_convert_encoding().
     * Otherwise the string is decoded with utf8_to_unicode() and every code
     * point is packed as a big endian 16 bit unit; code points above 0xFFFF
     * are written as a surrogate pair.
     *
     * @param  string  $str UTF-8 string (taken by reference, never modified here)
     * @param  boolean $bom prepend the UTF-16BE byte order mark "\xFE\xFF"?
     * @return string  UTF-16BE encoded string
     * @see    utf16be_to_utf8
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-8' === 'UTF-16BE' ? 'UTF-8' : 'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        // defensive: utf8_to_unicode() returns false when it rejects input
        if(!is_array($uni)) return $out;

        foreach($uni as $cp){
            if($cp > 0xFFFF){
                // astral plane: split into high and low surrogate
                $cp  -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}

if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into one code point
     * above 0xFFFF; all units are then encoded with unicode_to_utf8() in
     * non-strict mode, which skips the BOM and any unpaired surrogate.
     *
     * @param  string $str UTF-16BE string (taken by reference, never modified here)
     * @return string UTF-8 encoded string
     * @see    utf8_to_utf16be
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $units = array_values($units); // unpack() keys start at 1
        $count = count($units);
        $uni   = array();
        for($i=0; $i<$count; $i++){
            $unit = $units[$i];
            if($unit >= 0xD800 && $unit <= 0xDBFF && $i+1 < $count &&
               $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
                // valid surrogate pair -> single astral code point
                $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
                $i++;
            }else{
                $uni[] = $unit;
            }
        }
        return unicode_to_utf8($uni);
    }
}
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned once from left to right: at every position the
     * pattern matches either one well-formed UTF-8 character (copied to the
     * result) or, as a last resort, a single byte (replaced by $replace).
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param string $str     to search
     * @param string $replace to replace bad bytes with - use ASCII
     *                        (defaults to '', i.e. bad bytes are removed)
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
            '([\x00-\x7F]'.                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
            '|(.{1}))';                              # invalid byte

        $str    = (string) $str;
        $len    = strlen($str);
        $offset = 0;
        $result = '';

        // A: anchor every match at $offset, s: let the catch-all match any byte.
        // Walking an offset avoids re-copying the rest of the string after
        // each character.
        while ($offset < $len && preg_match('/'.$UTF8_BAD.'/SAs', $str, $matches, 0, $offset)) {
            if ( !isset($matches[2])) {
                // well-formed character, keep as is
                $result .= $matches[0];
            } else {
                // group 2 only participates for a byte no valid sequence covers
                $result .= $replace;
            }
            $offset += strlen($matches[0]);
        }

        return $result;
    }
}
(current character) 9555953e889Schris * true = down (next character) 9565953e889Schris * 9575953e889Schris * @return int byte index into $str now pointing to a utf8 character boundary 9585953e889Schris * 9595953e889Schris * @author chris smith <chris@jalakai.co.uk> 9605953e889Schris */ 9615953e889Schris function utf8_correctIdx(&$str,$i,$next=false) { 9625953e889Schris 963f50163d1Schris if ($i <= 0) return 0; 964f50163d1Schris 9655953e889Schris $limit = strlen($str); 966f50163d1Schris if ($i>=$limit) return $limit; 967f50163d1Schris 968f50163d1Schris if ($next) { 9695953e889Schris while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++; 9705953e889Schris } else { 9715953e889Schris while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--; 9725953e889Schris } 9735953e889Schris 9745953e889Schris return $i; 9755953e889Schris } 976df957b36SAndreas Gohr} 9775953e889Schris 978ab77016bSAndreas Gohr// only needed if no mb_string available 979ab77016bSAndreas Gohrif(!UTF8_MBSTRING){ 98015fa0b4fSAndreas Gohr /** 98182257610Sandi * UTF-8 Case lookup table 98282257610Sandi * 98382257610Sandi * This lookuptable defines the upper case letters to their correspponding 98482257610Sandi * lower case letter in UTF-8 98582257610Sandi * 98682257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 98782257610Sandi */ 98854662a04SAndreas Gohr global $UTF8_LOWER_TO_UPPER; 989df957b36SAndreas Gohr if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array( 99072de9068SAndreas Gohr "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q", 99172de9068SAndreas Gohr "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G", 99272de9068SAndreas Gohr "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ", 99385b77bbdSAndreas Gohr "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ", 99472de9068SAndreas Gohr 
"ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ", 99585b77bbdSAndreas Gohr "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ", 99685b77bbdSAndreas Gohr "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ", 99785b77bbdSAndreas Gohr "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ", 99872de9068SAndreas Gohr "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ", 99972de9068SAndreas Gohr "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ", 100072de9068SAndreas Gohr "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ", 100172de9068SAndreas Gohr "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ", 100272de9068SAndreas Gohr "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ", 100372de9068SAndreas Gohr "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố", 100472de9068SAndreas Gohr "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ", 100572de9068SAndreas Gohr "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ", 100672de9068SAndreas Gohr "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ", 100772de9068SAndreas Gohr "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ", 100872de9068SAndreas Gohr "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ", 100972de9068SAndreas Gohr "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ", 101072de9068SAndreas Gohr "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ", 101172de9068SAndreas Gohr 
"ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ", 101272de9068SAndreas Gohr "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ", 101372de9068SAndreas Gohr "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ", 101472de9068SAndreas Gohr "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս", 101572de9068SAndreas Gohr "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ", 101672de9068SAndreas Gohr "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ", 101772de9068SAndreas Gohr "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ", 101872de9068SAndreas Gohr "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ", 101972de9068SAndreas Gohr "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ", 102072de9068SAndreas Gohr "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ", 102172de9068SAndreas Gohr "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ", 102272de9068SAndreas Gohr "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ", 102372de9068SAndreas Gohr "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ", 102472de9068SAndreas Gohr "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ", 102572de9068SAndreas Gohr "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ", 102672de9068SAndreas Gohr "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ", 102772de9068SAndreas Gohr "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ", 102872de9068SAndreas Gohr 
"ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П", 102972de9068SAndreas Gohr "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е", 103072de9068SAndreas Gohr "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ", 103172de9068SAndreas Gohr "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ", 103272de9068SAndreas Gohr "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ", 103372de9068SAndreas Gohr "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π", 103472de9068SAndreas Gohr "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ", 103572de9068SAndreas Gohr "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ", 103672de9068SAndreas Gohr "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ", 103772de9068SAndreas Gohr "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ", 103872de9068SAndreas Gohr "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș", 103972de9068SAndreas Gohr "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ", 104072de9068SAndreas Gohr "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ", 104172de9068SAndreas Gohr "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ", 104272de9068SAndreas Gohr "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ", 104372de9068SAndreas Gohr "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ", 104472de9068SAndreas Gohr "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž", 104572de9068SAndreas Gohr 
"ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ", 104672de9068SAndreas Gohr "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ", 104772de9068SAndreas Gohr "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ", 104872de9068SAndreas Gohr "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī", 104972de9068SAndreas Gohr "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė", 105072de9068SAndreas Gohr "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă", 105172de9068SAndreas Gohr "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö", 105272de9068SAndreas Gohr "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì", 105372de9068SAndreas Gohr "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â", 105472de9068SAndreas Gohr "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T", 105572de9068SAndreas Gohr "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J", 105672de9068SAndreas Gohr "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A" 105782257610Sandi ); 105882257610Sandi 105982257610Sandi /** 106082257610Sandi * UTF-8 Case lookup table 106182257610Sandi * 1062e3736c26SAndreas Gohr * This lookuptable defines the lower case letters to their corresponding 106372de9068SAndreas Gohr * upper case letter in UTF-8 106482257610Sandi * 106582257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 106682257610Sandi */ 106754662a04SAndreas Gohr global $UTF8_UPPER_TO_LOWER; 1068df957b36SAndreas Gohr if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array ( 106972de9068SAndreas Gohr 
"Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q", 107072de9068SAndreas Gohr "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g", 107172de9068SAndreas Gohr "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ", 107285b77bbdSAndreas Gohr "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ", 107372de9068SAndreas Gohr "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ", 107485b77bbdSAndreas Gohr "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ", 107585b77bbdSAndreas Gohr "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ", 107685b77bbdSAndreas Gohr "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ", 107772de9068SAndreas Gohr "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ", 107872de9068SAndreas Gohr "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ", 107972de9068SAndreas Gohr "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ", 108072de9068SAndreas Gohr "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ", 108172de9068SAndreas Gohr "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ", 108272de9068SAndreas Gohr "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố", 108372de9068SAndreas Gohr "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ", 108472de9068SAndreas Gohr "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ", 108572de9068SAndreas Gohr "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ", 108672de9068SAndreas Gohr 
"Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ", 108772de9068SAndreas Gohr "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ", 108872de9068SAndreas Gohr "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ", 108972de9068SAndreas Gohr "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ", 109072de9068SAndreas Gohr "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ", 109172de9068SAndreas Gohr "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ", 109272de9068SAndreas Gohr "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ", 109372de9068SAndreas Gohr "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս", 109472de9068SAndreas Gohr "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ", 109572de9068SAndreas Gohr "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ", 109672de9068SAndreas Gohr "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ", 109772de9068SAndreas Gohr "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ", 109872de9068SAndreas Gohr "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ", 109972de9068SAndreas Gohr "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ", 110072de9068SAndreas Gohr "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ", 110172de9068SAndreas Gohr "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ", 110272de9068SAndreas Gohr "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ", 110372de9068SAndreas Gohr 
"Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ", 110472de9068SAndreas Gohr "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ", 110572de9068SAndreas Gohr "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ", 110672de9068SAndreas Gohr "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ", 110772de9068SAndreas Gohr "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п", 110872de9068SAndreas Gohr "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е", 110972de9068SAndreas Gohr "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ", 111072de9068SAndreas Gohr "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ", 111172de9068SAndreas Gohr "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ", 111272de9068SAndreas Gohr "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π", 111372de9068SAndreas Gohr "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ", 111472de9068SAndreas Gohr "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ", 111572de9068SAndreas Gohr "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ", 111672de9068SAndreas Gohr "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ", 111772de9068SAndreas Gohr "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș", 111872de9068SAndreas Gohr "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ", 111972de9068SAndreas Gohr "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ", 112072de9068SAndreas Gohr 
"Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ", 112172de9068SAndreas Gohr "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ", 112272de9068SAndreas Gohr "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ", 112372de9068SAndreas Gohr "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž", 112472de9068SAndreas Gohr "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ", 112572de9068SAndreas Gohr "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ", 112672de9068SAndreas Gohr "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ", 112772de9068SAndreas Gohr "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī", 112872de9068SAndreas Gohr "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė", 112972de9068SAndreas Gohr "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă", 113072de9068SAndreas Gohr "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö", 113172de9068SAndreas Gohr "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì", 113272de9068SAndreas Gohr "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â", 113372de9068SAndreas Gohr "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t", 113472de9068SAndreas Gohr "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j", 113572de9068SAndreas Gohr "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a" 113672de9068SAndreas Gohr ); 113772de9068SAndreas Gohr}; // end of case lookup tables 1138ab77016bSAndreas Gohr 113982257610Sandi/** 114082257610Sandi * 
UTF-8 lookup table for lower case accented letters 114182257610Sandi * 114282257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 114382257610Sandi * range. This are lower case letters only. 114482257610Sandi * 114582257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 114682257610Sandi * @see utf8_deaccent() 114782257610Sandi */ 114854662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS; 1149df957b36SAndreas Gohrif(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array( 115082257610Sandi 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o', 115182257610Sandi 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k', 115282257610Sandi 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o', 115382257610Sandi 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o', 115482257610Sandi 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c', 115582257610Sandi 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't', 115682257610Sandi 'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l', 115782257610Sandi 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z', 115882257610Sandi 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't', 115982257610Sandi 'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', 116082257610Sandi 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j', 116182257610Sandi 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o', 116282257610Sandi 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g', 116382257610Sandi 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a', 116474c0c504Schris 'û' => 'u', 'þ' => 'th', 'ð' => 
'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e', 116582257610Sandi); 116682257610Sandi 116782257610Sandi/** 116882257610Sandi * UTF-8 lookup table for upper case accented letters 116982257610Sandi * 117082257610Sandi * This lookuptable defines replacements for accented characters from the ASCII-7 117182257610Sandi * range. This are upper case letters only. 117282257610Sandi * 117382257610Sandi * @author Andreas Gohr <andi@splitbrain.org> 117482257610Sandi * @see utf8_deaccent() 117582257610Sandi */ 117654662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS; 1177df957b36SAndreas Gohrif(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array( 1178df3ecd55SAndreas Gohr 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O', 1179df3ecd55SAndreas Gohr 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 1180df3ecd55SAndreas Gohr 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O', 1181df3ecd55SAndreas Gohr 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O', 1182df3ecd55SAndreas Gohr 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C', 1183df3ecd55SAndreas Gohr 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T', 1184df3ecd55SAndreas Gohr 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L', 1185df3ecd55SAndreas Gohr 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z', 1186df3ecd55SAndreas Gohr 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T', 1187df3ecd55SAndreas Gohr 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', 1188df3ecd55SAndreas Gohr 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J', 1189df3ecd55SAndreas Gohr 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O', 1190df3ecd55SAndreas Gohr 'Â' => 'A', 'Ľ' => 'L', 
'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G', 1191df3ecd55SAndreas Gohr 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A', 119274c0c504Schris 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E', 119382257610Sandi); 119482257610Sandi 1195099ada41Sandi/** 1196099ada41Sandi * UTF-8 array of common special characters 1197099ada41Sandi * 1198099ada41Sandi * This array should contain all special characters (not a letter or digit) 1199099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum 1200099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special 1201099ada41Sandi * chars. 1202099ada41Sandi * 1203099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! 1204ad81d431SAndreas Gohr * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a 1205099ada41Sandi * 1206099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org> 1207099ada41Sandi * @see utf8_stripspecials() 1208099ada41Sandi */ 120954662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS; 1210df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array( 1211099ada41Sandi 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 1212ad81d431SAndreas Gohr 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 12135c812709Sandi 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 12145c812709Sandi 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 1215099ada41Sandi 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 1216099ada41Sandi 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 1217099ada41Sandi 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 1218099ada41Sandi 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 1219099ada41Sandi 
0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 1220099ada41Sandi 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 1221099ada41Sandi 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, 1222099ada41Sandi 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, 1223fae4b5fcSAndreas Gohr 0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, 1224099ada41Sandi 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 1225099ada41Sandi 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, 1226099ada41Sandi 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 1227099ada41Sandi 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 1228099ada41Sandi 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, 1229099ada41Sandi 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, 1230099ada41Sandi 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, 1231099ada41Sandi 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, 1232099ada41Sandi 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, 1233099ada41Sandi 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 1234099ada41Sandi 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, 1235099ada41Sandi 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, 1236099ada41Sandi 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 1237099ada41Sandi 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 1238099ada41Sandi 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 1239099ada41Sandi 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 
0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 1240099ada41Sandi 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 1241099ada41Sandi 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, 1242099ada41Sandi 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, 1243099ada41Sandi 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, 1244099ada41Sandi 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 1245099ada41Sandi 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, 1246099ada41Sandi 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, 1247099ada41Sandi 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, 1248099ada41Sandi 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 1249099ada41Sandi 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, 1250099ada41Sandi 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, 1251099ada41Sandi 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f, 1252099ada41Sandi 0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 1253099ada41Sandi 0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8, 1254099ada41Sandi 0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3, 1255099ada41Sandi 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd, 1256d5b23302STom N Harris 0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 1257d5b23302STom N Harris 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017, 1258d5b23302STom N Harris 0x3018, 0x3019, 0x301a, 0x301b, 0x3036, 1259d5b23302STom N Harris 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc, 
1260099ada41Sandi 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6, 1261099ada41Sandi 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 1262099ada41Sandi 0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 1263099ada41Sandi 0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d, 1264d5b23302STom N Harris 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 1265d5b23302STom N Harris 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, 1266d5b23302STom N Harris 0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b, 1267d5b23302STom N Harris 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 1268d5b23302STom N Harris 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea, 1269d5b23302STom N Harris 0xffeb, 0xffec, 0xffed, 0xffee, 1270fae4b5fcSAndreas Gohr 0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703, 1271fae4b5fcSAndreas Gohr 0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b, 1272fae4b5fcSAndreas Gohr 0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713, 12737de9cff5SAndreas Gohr 0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b, 12747de9cff5SAndreas Gohr 0xc2a0, 0xe28087, 0xe280af, 0xe281a0, 0xefbbbf, 1275099ada41Sandi); 1276340756e4Sandi 1277720307d9Schris// utf8 version of above data 1278720307d9Schrisglobal $UTF8_SPECIAL_CHARS2; 1279df957b36SAndreas Gohrif(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 = 128037242afaSTom N Harris "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~ �'. 128132261ab5SChristopher Smith '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'. 128285b77bbdSAndreas Gohr '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'. 1283720307d9Schris '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿–—―‗‘’‚“”�'. 128485b77bbdSAndreas Gohr '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'. 
1285720307d9Schris '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'. 128685b77bbdSAndreas Gohr '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'. 1287720307d9Schris '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'. 1288720307d9Schris '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'. 1289720307d9Schris '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'. 1290720307d9Schris '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'. 1291720307d9Schris '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'. 1292720307d9Schris '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'. 1293720307d9Schris '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'. 1294d5b23302STom N Harris '➷➸➹➺➻➼➽➾'. 1295d5b23302STom N Harris ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'. 1296d5b23302STom N Harris '�'. 1297d5b23302STom N Harris '�ﹼﹽ'. 1298d5b23302STom N Harris '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'. 1299fae4b5fcSAndreas Gohr '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'. 13007de9cff5SAndreas Gohr ''. 13017de9cff5SAndreas Gohr ' '; 1302720307d9Schris 13038a831f2bSAndreas Gohr/** 13048a831f2bSAndreas Gohr * Romanization lookup table 13058a831f2bSAndreas Gohr * 13068a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language 13078a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII. 13088a831f2bSAndreas Gohr * 13098a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works 13108a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement 13118a831f2bSAndreas Gohr * only. Specialities of each language are not supported. 
13128a831f2bSAndreas Gohr * 13138a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org> 13148a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com> 13158a831f2bSAndreas Gohr * @link http://www.uconv.com/translit.htm 13168a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi> 13178a831f2bSAndreas Gohr * @link http://kanjidict.stc.cx/hiragana.php?src=2 13188a831f2bSAndreas Gohr * @link http://www.translatum.gr/converter/greek-transliteration.htm 13198a831f2bSAndreas Gohr * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription 13208a831f2bSAndreas Gohr * @link http://www.btranslations.com/resources/romanization/korean.asp 1321014d0ab6SAndreas Gohr * @author Arthit Suriyawongkul <arthit@gmail.com> 1322fed467f8SDenis Scheither * @author Denis Scheither <amorphis@uni-bremen.de> 132356c92de6SEivind Morland * @author Eivind Morland <eivind.morland@gmail.com> 13248a831f2bSAndreas Gohr */ 132554662a04SAndreas Gohrglobal $UTF8_ROMANIZATION; 1326df957b36SAndreas Gohrif(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array( 1327176ae32bSAndreas Gohr // scandinavian - differs from what we do in deaccent 1328176ae32bSAndreas Gohr 'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O', 1329176ae32bSAndreas Gohr 13308a831f2bSAndreas Gohr //russian cyrillic 13318a831f2bSAndreas Gohr 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G', 13328a831f2bSAndreas Gohr 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh', 13338a831f2bSAndreas Gohr 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K', 13348a831f2bSAndreas Gohr 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O', 13358a831f2bSAndreas Gohr 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T', 13368a831f2bSAndreas Gohr 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C', 1337d8cb2602SDenis Simakov 
'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'', 1338f5e334deSAndreas Gohr 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju', 13398a831f2bSAndreas Gohr 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja', 13408a831f2bSAndreas Gohr // Ukrainian cyrillic 13418a831f2bSAndreas Gohr 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji', 13428a831f2bSAndreas Gohr // Georgian 13438a831f2bSAndreas Gohr 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th', 13448a831f2bSAndreas Gohr 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh', 13458a831f2bSAndreas Gohr 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q', 13468a831f2bSAndreas Gohr 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh', 13478a831f2bSAndreas Gohr 'ჰ'=>'xh', 13488a831f2bSAndreas Gohr //Sanskrit 13498a831f2bSAndreas Gohr 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry', 13508a831f2bSAndreas Gohr 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw', 13518a831f2bSAndreas Gohr 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh', 13528a831f2bSAndreas Gohr 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh', 13538a831f2bSAndreas Gohr 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh', 13548a831f2bSAndreas Gohr 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r', 13558a831f2bSAndreas Gohr 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x', 135656c92de6SEivind Morland //Sanskrit diacritics 135756c92de6SEivind Morland 'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T', 135856c92de6SEivind Morland 'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L', 135956c92de6SEivind Morland 'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t', 136056c92de6SEivind Morland 
'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l', 13618a831f2bSAndreas Gohr //Hebrew 13623dbad6dcSDenis Simakov 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th', 13633dbad6dcSDenis Simakov 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n', 13643dbad6dcSDenis Simakov 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r', 13658a831f2bSAndreas Gohr 'ש'=>'sh','ת'=>'t', 13668a831f2bSAndreas Gohr //Arabic 13678a831f2bSAndreas Gohr 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d', 13688a831f2bSAndreas Gohr 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'', 13698a831f2bSAndreas Gohr 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k', 13708a831f2bSAndreas Gohr 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i', 13718a831f2bSAndreas Gohr 1372799e0977SAndreas Gohr // Japanese characters (last update: 2008-05-09) 13739476a253SAndreas Gohr 13748a831f2bSAndreas Gohr // Japanese hiragana 1375fed467f8SDenis Scheither 1376fed467f8SDenis Scheither // 3 character syllables, っ doubles the consonant after 1377fed467f8SDenis Scheither 'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu', 1378879205e1SAndreas Gohr 'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu', 1379799e0977SAndreas Gohr 'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu', 1380879205e1SAndreas Gohr 'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu', 1381879205e1SAndreas Gohr // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu', 1382879205e1SAndreas Gohr 'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu', 1383879205e1SAndreas Gohr 'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu', 1384879205e1SAndreas Gohr 'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu', 1385879205e1SAndreas Gohr 
'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu', 1386879205e1SAndreas Gohr 'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu', 1387879205e1SAndreas Gohr 'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu', 1388879205e1SAndreas Gohr 1389879205e1SAndreas Gohr // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway) 1390879205e1SAndreas Gohr 'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u', 1391879205e1SAndreas Gohr 'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu', 1392fed467f8SDenis Scheither 1393fed467f8SDenis Scheither // 2 character syllables - normal 1394879205e1SAndreas Gohr 'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo', 1395fed467f8SDenis Scheither 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu', 1396fed467f8SDenis Scheither 'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu', 1397799e0977SAndreas Gohr 'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu', 1398799e0977SAndreas Gohr 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu', 1399fed467f8SDenis Scheither 'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu', 1400fed467f8SDenis Scheither 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu', 1401fed467f8SDenis Scheither 'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu', 1402fed467f8SDenis Scheither 'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu', 1403fed467f8SDenis Scheither 'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu', 1404fed467f8SDenis Scheither 'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu', 1405879205e1SAndreas Gohr 'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju', 1406879205e1SAndreas Gohr 'うぇ'=>'we','うぃ'=>'wi', 1407879205e1SAndreas Gohr 'いぇ'=>'ye', 1408fed467f8SDenis Scheither 1409fed467f8SDenis Scheither // 2 character syllables, っ doubles the consonant after 1410fed467f8SDenis Scheither 
'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu', 1411fed467f8SDenis Scheither 'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu', 1412fed467f8SDenis Scheither 'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu', 1413fed467f8SDenis Scheither 'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu', 1414fed467f8SDenis Scheither 'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu', 1415fed467f8SDenis Scheither 'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku', 1416fed467f8SDenis Scheither 'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu', 1417fed467f8SDenis Scheither 'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu', 1418fed467f8SDenis Scheither 'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru', 1419fed467f8SDenis Scheither 'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu', 1420799e0977SAndreas Gohr 'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu', 1421fed467f8SDenis Scheither 1422fed467f8SDenis Scheither // 1 character syllabels 1423fed467f8SDenis Scheither 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n', 1424879205e1SAndreas Gohr 'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu', 1425fed467f8SDenis Scheither 'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu', 1426fed467f8SDenis Scheither 'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu', 14279476a253SAndreas Gohr 'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu', 1428fed467f8SDenis Scheither 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du', 1429fed467f8SDenis Scheither 'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu', 1430fed467f8SDenis Scheither 'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku', 1431fed467f8SDenis Scheither 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu', 1432fed467f8SDenis Scheither 'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu', 1433fed467f8SDenis Scheither 'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru', 1434fed467f8SDenis Scheither 
'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su', 1435879205e1SAndreas Gohr 'わ'=>'wa','を'=>'wo', 1436879205e1SAndreas Gohr 'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu', 1437879205e1SAndreas Gohr 'や'=>'ya','よ'=>'yo','ゆ'=>'yu', 14389476a253SAndreas Gohr // old characters 14399476a253SAndreas Gohr 'ゑ'=>'we','ゐ'=>'wi', 1440fed467f8SDenis Scheither 14419476a253SAndreas Gohr // convert what's left (probably only kicks in when something's missing above) 14429476a253SAndreas Gohr // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u', 14439476a253SAndreas Gohr // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu', 1444fed467f8SDenis Scheither 14459476a253SAndreas Gohr // never seen one of those (disabled for the moment) 1446879205e1SAndreas Gohr // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu', 14479476a253SAndreas Gohr // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu', 14489476a253SAndreas Gohr // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu', 14499476a253SAndreas Gohr // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu', 14509476a253SAndreas Gohr // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu', 14519476a253SAndreas Gohr // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu', 14529476a253SAndreas Gohr // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu', 14539476a253SAndreas Gohr // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu', 14549476a253SAndreas Gohr // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu', 14559476a253SAndreas Gohr // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu', 14569476a253SAndreas Gohr // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu', 14579476a253SAndreas Gohr // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu', 14589476a253SAndreas Gohr // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu', 14599476a253SAndreas Gohr // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu', 1460fed467f8SDenis Scheither 
1461fed467f8SDenis Scheither // 'spare' characters from other romanization systems 1462fed467f8SDenis Scheither // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du', 1463fed467f8SDenis Scheither // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu', 1464fed467f8SDenis Scheither // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su', 1465fed467f8SDenis Scheither // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu', 1466fed467f8SDenis Scheither //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu', 1467fed467f8SDenis Scheither //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu', 1468fed467f8SDenis Scheither //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu', 1469fed467f8SDenis Scheither //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu', 1470fed467f8SDenis Scheither //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi', 1471fed467f8SDenis Scheither //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju', 1472fed467f8SDenis Scheither 1473fed467f8SDenis Scheither 14748a831f2bSAndreas Gohr // Japanese katakana 1475fed467f8SDenis Scheither 1476fed467f8SDenis Scheither // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs) 1477fed467f8SDenis Scheither 'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu', 1478fed467f8SDenis Scheither 'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu', 1479fed467f8SDenis Scheither 'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu', 1480fed467f8SDenis Scheither 'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu', 1481fed467f8SDenis Scheither 'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu', 1482fed467f8SDenis Scheither 'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu', 1483fed467f8SDenis Scheither 
'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu', 1484fed467f8SDenis Scheither 'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu', 1485fed467f8SDenis Scheither 'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu', 1486799e0977SAndreas Gohr 'ッティー'=>'ttii', 1487799e0977SAndreas Gohr 'ッヂィー'=>'ddii', 1488fed467f8SDenis Scheither 1489fed467f8SDenis Scheither // 3 character syllables - doubled vowels 1490fed467f8SDenis Scheither 'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo', 1491fed467f8SDenis Scheither 'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu', 1492fed467f8SDenis Scheither 'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu', 1493fed467f8SDenis Scheither 'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu', 1494fed467f8SDenis Scheither 'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu', 1495fed467f8SDenis Scheither 'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu', 1496fed467f8SDenis Scheither 'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu', 1497fed467f8SDenis Scheither 'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu', 1498fed467f8SDenis Scheither 'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu', 1499fed467f8SDenis Scheither 'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu', 1500fed467f8SDenis Scheither 'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu', 1501fed467f8SDenis Scheither 'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu', 1502fed467f8SDenis Scheither 'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu', 1503fed467f8SDenis Scheither 'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu', 1504fed467f8SDenis Scheither 'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu', 1505fed467f8SDenis Scheither 
'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu', 1506fed467f8SDenis Scheither 'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu', 1507fed467f8SDenis Scheither 'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu', 1508fed467f8SDenis Scheither 'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu', 1509fed467f8SDenis Scheither 'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu', 1510fed467f8SDenis Scheither 'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu', 1511fed467f8SDenis Scheither 'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu', 1512fed467f8SDenis Scheither 'ウェー'=>'wee','ウィー'=>'wii', 1513fed467f8SDenis Scheither 'イェー'=>'yee', 1514799e0977SAndreas Gohr 'ティー'=>'tii', 1515799e0977SAndreas Gohr 'ヂィー'=>'dii', 1516fed467f8SDenis Scheither 1517fed467f8SDenis Scheither // 3 character syllables - doubled consonants 1518fed467f8SDenis Scheither 'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu', 1519fed467f8SDenis Scheither 'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu', 1520fed467f8SDenis Scheither 'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu', 1521fed467f8SDenis Scheither 'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu', 1522fed467f8SDenis Scheither 'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu', 1523fed467f8SDenis Scheither 'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu', 1524fed467f8SDenis Scheither 'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu', 1525fed467f8SDenis Scheither 'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu', 1526fed467f8SDenis Scheither 'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu', 1527799e0977SAndreas Gohr 'ッティ'=>'tti', 1528799e0977SAndreas Gohr 'ッヂィ'=>'ddi', 1529fed467f8SDenis Scheither 1530fed467f8SDenis Scheither // 3 character 
syllables - doubled vowel and consonants 1531fed467f8SDenis Scheither 'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu', 1532fed467f8SDenis Scheither 'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu', 1533fed467f8SDenis Scheither 'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa', 1534fed467f8SDenis Scheither 'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu', 1535fed467f8SDenis Scheither 'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu', 1536fed467f8SDenis Scheither 'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu', 1537fed467f8SDenis Scheither 'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu', 1538fed467f8SDenis Scheither 'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu', 1539799e0977SAndreas Gohr 'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu', 1540799e0977SAndreas Gohr 'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu', 1541fed467f8SDenis Scheither 'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu', 1542fed467f8SDenis Scheither 1543fed467f8SDenis Scheither // 2 character syllables - normal 1544799e0977SAndreas Gohr 'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu', 1545799e0977SAndreas Gohr // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu', 1546799e0977SAndreas Gohr 'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu', 1547fed467f8SDenis Scheither 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu', 1548fed467f8SDenis Scheither 'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu', 1549fed467f8SDenis Scheither 'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu', 1550fed467f8SDenis Scheither 'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu', 1551fed467f8SDenis Scheither 'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu', 1552fed467f8SDenis Scheither 
'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu', 1553fed467f8SDenis Scheither 'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu', 1554fed467f8SDenis Scheither 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu', 1555879205e1SAndreas Gohr 'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu', 1556879205e1SAndreas Gohr 'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju', 1557fed467f8SDenis Scheither 'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu', 1558fed467f8SDenis Scheither 'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du', 1559fed467f8SDenis Scheither 'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu', 1560799e0977SAndreas Gohr // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu', 1561fed467f8SDenis Scheither 'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu', 1562fed467f8SDenis Scheither 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu', 1563fed467f8SDenis Scheither 'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu', 1564fed467f8SDenis Scheither 'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu', 1565fed467f8SDenis Scheither 'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu', 1566fed467f8SDenis Scheither 'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu', 1567fed467f8SDenis Scheither 'ウェ'=>'we','ウィ'=>'wi', 1568fed467f8SDenis Scheither 'イェ'=>'ye', 1569799e0977SAndreas Gohr 'ティ'=>'ti', 1570799e0977SAndreas Gohr 'ヂィ'=>'di', 1571fed467f8SDenis Scheither 1572fed467f8SDenis Scheither // 2 character syllables - doubled vocal 1573fed467f8SDenis Scheither 'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu', 1574fed467f8SDenis Scheither 'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu', 1575fed467f8SDenis Scheither 'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu', 1576fed467f8SDenis Scheither 'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu', 1577fed467f8SDenis Scheither 'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu', 
1578fed467f8SDenis Scheither 'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa', 1579fed467f8SDenis Scheither 'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu', 1580fed467f8SDenis Scheither 'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu', 1581fed467f8SDenis Scheither 'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu', 1582fed467f8SDenis Scheither 'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu', 1583fed467f8SDenis Scheither 'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu', 1584799e0977SAndreas Gohr 'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu', 1585fed467f8SDenis Scheither 'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu', 1586fed467f8SDenis Scheither 'ワー'=>'waa','ヲー'=>'woo', 1587fed467f8SDenis Scheither 'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu', 1588fed467f8SDenis Scheither 'ヵー'=>'kaa','ヶー'=>'kee', 15899476a253SAndreas Gohr // old characters 15909476a253SAndreas Gohr 'ヱー'=>'wee','ヰー'=>'wii', 1591fed467f8SDenis Scheither 1592879205e1SAndreas Gohr // seperate katakana 'n' 1593879205e1SAndreas Gohr 'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u', 1594879205e1SAndreas Gohr 'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu', 1595879205e1SAndreas Gohr 1596fed467f8SDenis Scheither // 2 character syllables - doubled consonants 1597fed467f8SDenis Scheither 'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu', 1598fed467f8SDenis Scheither 'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu', 1599fed467f8SDenis Scheither 'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka', 1600fed467f8SDenis Scheither 'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu', 1601fed467f8SDenis Scheither 'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu', 1602fed467f8SDenis Scheither 'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu', 1603fed467f8SDenis Scheither 'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru', 1604fed467f8SDenis Scheither 
'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu', 1605799e0977SAndreas Gohr 'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu', 1606799e0977SAndreas Gohr 'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu', 1607fed467f8SDenis Scheither 'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu', 1608fed467f8SDenis Scheither 1609fed467f8SDenis Scheither // 1 character syllables 1610fed467f8SDenis Scheither 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n', 1611fed467f8SDenis Scheither 'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu', 1612fed467f8SDenis Scheither 'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu', 1613fed467f8SDenis Scheither 'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu', 1614fed467f8SDenis Scheither 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka', 1615fed467f8SDenis Scheither 'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu', 1616fed467f8SDenis Scheither 'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu', 1617fed467f8SDenis Scheither 'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu', 1618fed467f8SDenis Scheither 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru', 1619fed467f8SDenis Scheither 'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su', 1620879205e1SAndreas Gohr 'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu', 1621fed467f8SDenis Scheither 'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu', 1622fed467f8SDenis Scheither 'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du', 1623fed467f8SDenis Scheither 'ワ'=>'wa','ヲ'=>'wo', 1624fed467f8SDenis Scheither 'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu', 1625fed467f8SDenis Scheither 'ヵ'=>'ka','ヶ'=>'ke', 16269476a253SAndreas Gohr // old characters 16279476a253SAndreas Gohr 'ヱ'=>'we','ヰ'=>'wi', 1628fed467f8SDenis Scheither 16299476a253SAndreas Gohr // convert what's left (probably only kicks in when something's missing above) 1630fed467f8SDenis Scheither 'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u', 1631fed467f8SDenis Scheither 'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu', 1632fed467f8SDenis 
Scheither 1633799e0977SAndreas Gohr // special characters 1634799e0977SAndreas Gohr '・'=>'_','、'=>'_', 1635799e0977SAndreas Gohr 'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise 1636799e0977SAndreas Gohr 1637fed467f8SDenis Scheither // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu', 1638fed467f8SDenis Scheither // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu', 1639fed467f8SDenis Scheither //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu', 1640fed467f8SDenis Scheither // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu', 1641fed467f8SDenis Scheither // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu', 1642fed467f8SDenis Scheither //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu', 1643fed467f8SDenis Scheither //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu', 1644fed467f8SDenis Scheither // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu', 1645fed467f8SDenis Scheither // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu', 1646fed467f8SDenis Scheither //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu', 1647fed467f8SDenis Scheither //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu', 1648fed467f8SDenis Scheither //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi', 16498a831f2bSAndreas Gohr 16508a831f2bSAndreas Gohr // "Greeklish" 16518a831f2bSAndreas Gohr 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps', 16528a831f2bSAndreas Gohr 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps', 16538a831f2bSAndreas Gohr 16548a831f2bSAndreas Gohr // Thai 16558a831f2bSAndreas Gohr 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch', 16568a831f2bSAndreas Gohr 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th', 16578a831f2bSAndreas Gohr 
'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th', 16588a831f2bSAndreas Gohr 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph', 16598a831f2bSAndreas Gohr 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue', 16608a831f2bSAndreas Gohr 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h', 1661014d0ab6SAndreas Gohr 'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am', 1662014d0ab6SAndreas Gohr 'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u', 1663014d0ab6SAndreas Gohr 'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o', 1664014d0ab6SAndreas Gohr 'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua', 1665014d0ab6SAndreas Gohr 'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao', 1666014d0ab6SAndreas Gohr 'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai', 1667014d0ab6SAndreas Gohr 'ิว'=>'io','็ว'=>'eo','ียว'=>'iao', 1668014d0ab6SAndreas Gohr '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'', 1669014d0ab6SAndreas Gohr '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'', 1670014d0ab6SAndreas Gohr 'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-', 1671014d0ab6SAndreas Gohr '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4', 1672014d0ab6SAndreas Gohr '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9', 16738a831f2bSAndreas Gohr 16748a831f2bSAndreas Gohr // Korean 16758a831f2bSAndreas Gohr 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p', 16768a831f2bSAndreas Gohr 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss', 16778a831f2bSAndreas Gohr 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o', 16788a831f2bSAndreas Gohr 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we', 16798a831f2bSAndreas Gohr 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy', 16808a831f2bSAndreas Gohr 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey', 16818a831f2bSAndreas Gohr); 1682340756e4Sandi 16838a831f2bSAndreas Gohr 1684