<?php
/**
 * UTF8 helper functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * URL-encode a filename to allow Unicode characters
 *
 * Slashes are not encoded.
 *
 * When the second parameter is true the string is only encoded if it
 * contains characters outside of [a-zA-Z0-9/_-.%]. This makes it safe
 * to run the function multiple times on the same string (default is true).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding when the name already looks encoded
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
    return $file;
  }
  $file = urlencode($file);
  $file = str_replace('%2F','/',$file);
  return $file;
}

/**
 * URL-decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  $file = urldecode($file);
  return $file;
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool   true when no byte above 127 is present
 */
function utf8_isASCII($str){
  // bytewise match (no /u modifier): any high byte means non-ASCII
  return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string the input with every byte above 127 removed
 */
function utf8_strip($str){
  // bytewise replace (no /u modifier) so that invalid UTF-8 is handled, too
  return preg_replace('/[\x80-\xFF]/','',$str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the structure of the byte sequences is checked (lead byte followed
 * by the announced number of continuation bytes). Lead bytes announcing
 * up to five continuation bytes are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool
 */
function utf8_check($Str) {
  $len = strlen($Str);
  for ($i=0; $i<$len; $i++) {
    $byte = ord($Str[$i]);
    if ($byte < 0x80) continue;               # 0bbbbbbb
    elseif (($byte & 0xE0) == 0xC0) $n=1;     # 110bbbbb
    elseif (($byte & 0xF0) == 0xE0) $n=2;     # 1110bbbb
    elseif (($byte & 0xF8) == 0xF0) $n=3;     # 11110bbb
    elseif (($byte & 0xFC) == 0xF8) $n=4;     # 111110bb
    elseif (($byte & 0xFE) == 0xFC) $n=5;     # 1111110b
    else return false;                        # Does not match any model
    for ($j=0; $j<$n; $j++) {                 # n bytes matching 10bbbbbb follow ?
      if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
        return false;
    }
  }
  return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * Uses mb_string extension if available. The fallback removes all UTF-8
 * continuation bytes (10bbbbbb) and counts what is left - every character
 * of a valid UTF-8 string contributes exactly one such byte.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string
 * @return int    number of characters
 */
function utf8_strlen($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen'))
    return mb_strlen($string,'utf-8');

  return strlen(preg_replace('/[\x80-\xBF]/','',$string));
}

/**
 * Unicode aware replacement for substr()
 *
 * @todo   Handle negative positions etc.
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    substr()
 * @param  string   $str
 * @param  int      $start  number of characters to skip
 * @param  int|null $length maximum number of characters to return, null for "the rest"
 * @return string|false     false when $start is beyond the end of the string
 */
function utf8_substr($str, $start, $length=null){
  // both values end up inside a regular expression - make sure they are plain numbers
  $start = (int) $start;
  if ( is_null($length) ) {
    $length = '*';
  } else {
    $length = '{0,'.((int) $length).'}';
  }
  $pattern = '/^.{'.$start.'}(.'.$length.')/us';
  preg_match($pattern, $str, $matches);

  if ( isset($matches[1]) ) {
    return $matches[1];
  }
  return false;
}

/**
 * Unicode aware replacement for explode()
 *
 * @todo   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode()
 * @param  string $sep delimiter
 * @param  string $str string to split
 * @return array|false false (plus a warning) on empty delimiter
 */
function utf8_explode($sep, $str) {
  if ( $sep == '' ) {
    trigger_error('Empty delimiter',E_USER_WARNING);
    return FALSE;
  }

  return preg_split('!'.preg_quote($sep,'!').'!u',$str);
}

/**
 * Unicode aware replacement for str_replace()
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace()
 * @param  string|array $s   search string(s)
 * @param  string|array $r   replacement string(s), always taken literally
 * @param  string|array $str subject
 * @return string|array
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // preg_replace would treat $1 or \1 in the replacement as backreferences,
  // str_replace semantics require the replacement to be used verbatim
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }else{
    $r = addcslashes($r,'\\$');
  }

  return preg_replace($s,$r,$str);
}

/**
 * Unicode aware replacement for ltrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip, every character is taken literally
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip, every character is taken literally
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip, every character is taken literally
 * @return string
 */
function utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the
 * $UTF8_UPPER_TO_LOWER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most codepoints are not in the table - check before reading
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the
 * $UTF8_LOWER_TO_UPPER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most codepoints are not in the table - check before reading
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
  }
  return $string;
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Be sure to specify all specialchars you give in $repl in $keep, too
 * or it won't work.
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$keep=''){
  global $UTF8_SPECIAL_CHARS;
  if($keep != ''){
    $specials = array_diff($UTF8_SPECIAL_CHARS, utf8_to_unicode($keep));
  }else{
    $specials = $UTF8_SPECIAL_CHARS;
  }

  // build one big character class from the remaining codepoints
  $specials = unicode_to_utf8($specials);
  $specials = preg_quote($specials, '/');

  return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack
 * @param  string $needle
 * @param  int    $offset   number of characters to skip before searching
 * @return int|false        character position of the first match or false
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    // everything in front of the first separator is the prefix we are looking for
    $ar = utf8_explode($needle, $haystack);
    if ( count($ar) > 1 ) {
      return utf8_strlen($ar[0]);
    }
    return false;
  }else{
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search in the remainder and shift the result back by the skipped characters
    $haystack = utf8_substr($haystack, $offset);

    if ( false !== ($pos = utf8_strpos($haystack,$needle))){
      return $pos + $offset;
    }
    return false;
  }
}

/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values:
 *
 * Sequences of two, three and four bytes are decoded. The input is not
 * validated - pass well-formed UTF-8 only.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded string
 * @return array  list of integer codepoints
 */
function utf8_to_unicode( $str ) {
  $unicode    = array();
  $values     = array();
  $lookingFor = 1;
  $len        = strlen( $str );

  for ( $i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // the lead byte tells how many bytes make up the character
    if ( count( $values ) == 0 ) {
      if     ( $thisValue < 224 ) $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else                        $lookingFor = 4;
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 ) +
                  ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
      } else {
        $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
      }
      $unicode[]  = $number;
      $values     = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}

/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Codepoints are written as one to four bytes.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer codepoints
 * @return string UTF-8 encoded string
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    $unicode = (int) $unicode;
    if ( $unicode < 128 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 2048 ) {
      $utf8.= chr( 192 + ( $unicode >> 6 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } elseif ( $unicode < 65536 ) {
      $utf8.= chr( 224 + ( $unicode >> 12 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } else {
      $utf8.= chr( 240 + ( $unicode >> 18 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 12 ) & 63 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    }
  }
  return $utf8;
}

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the codepoints of lower case letters to the
 * codepoints of their corresponding upper case letters
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the codepoints of upper case letters to the
 * codepoints of their corresponding lower case letters (it does so by
 * flipping $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are upper case letters only - the keys have to be the upper
 * case forms or utf8_deaccent() will never match them.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'ẞ' => 'Ss', 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
  0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);


//Setup VIM: ex: et ts=2 enc=utf-8 :