<?php
/**
 * UTF8 helper functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded.
 *
 * When $safe is true the string is returned untouched if it consists only
 * of the characters a-z, A-Z, 0-9, '/', '_', '-', '.' and '%'. Because the
 * output of this function only contains such characters (plus '+'), this
 * makes it safe to run it several times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when only safe characters are present (default true)
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
        return $file;
    }
    // urlencode() also encodes the slash, so put it back afterwards
    return str_replace('%2F','/',urlencode($file));
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return bool   true when no byte has the high bit set (also for '')
 */
function utf8_isASCII($str){
    // A single byte-wise scan; avoids the removed $str{$i} offset syntax
    // and the repeated strlen() call of a hand-written loop.
    return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Tries to detect if a string is well-formed UTF-8
 *
 * Only the byte structure is checked: every lead byte has to be followed by
 * the matching number of continuation bytes (10bbbbbb). The historic 5 and 6
 * byte forms are accepted, overlong encodings are not rejected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool
 */
function utf8_check($Str) {
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $byte = ord($Str[$i]);
        if ($byte < 0x80) continue;                 # 0bbbbbbb
        elseif (($byte & 0xE0) == 0xC0) $n=1;       # 110bbbbb
        elseif (($byte & 0xF0) == 0xE0) $n=2;       # 1110bbbb
        elseif (($byte & 0xF8) == 0xF0) $n=3;       # 11110bbb
        elseif (($byte & 0xFC) == 0xF8) $n=4;       # 111110bb
        elseif (($byte & 0xFE) == 0xFC) $n=5;       # 1111110b
        else return false;                          # Does not match any model
        for ($j=0; $j<$n; $j++) {                   # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * This is a unicode aware replacement for strlen()
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not defined)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strlen()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen'))
        return mb_strlen($string,'utf-8');

    return count(utf8_to_unicode($string));
}

/**
 * This is a unicode aware replacement for substr()
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not defined)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr()
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  character offset to start at
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string
 */
function utf8_substr($str, $start, $length=null){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_substr')){
        // Older PHP versions treat a null length as 0 and return an empty
        // string, so "to the end" is expressed with an explicit length.
        if($length === null) $length = mb_strlen($str,'utf-8');
        return mb_substr($str,$start,$length,'utf-8');
    }

    $uni = utf8_to_unicode($str);
    $uni = ($length === null) ? array_slice($uni,$start)
                              : array_slice($uni,$start,$length);
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not defined),
 * otherwise the $UTF8_UPPER_TO_LOWER lookup table is used.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
        return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    foreach($uni as $idx => $code){
        // code points without an entry in the table are left untouched
        if(isset($UTF8_UPPER_TO_LOWER[$code])){
            $uni[$idx] = $UTF8_UPPER_TO_LOWER[$code];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not defined),
 * otherwise the $UTF8_LOWER_TO_UPPER lookup table is used.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
        return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    foreach($uni as $idx => $code){
        // code points without an entry in the table are left untouched
        if(isset($UTF8_LOWER_TO_UPPER[$code])){
            $uni[$idx] = $UTF8_LOWER_TO_UPPER[$code];
        }
    }
    return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
    global $UTF8_LOWER_ACCENTS;
    global $UTF8_UPPER_ACCENTS;

    if($case <= 0){
        $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
    }
    if($case >= 0){
        $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
    }
    return $string;
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Be sure to specify all specialchars you give in $repl in $keep, too
 * or it won't work.
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string|null    the result of preg_replace()
 */
function utf8_stripspecials($string,$repl='',$keep=''){
    global $UTF8_SPECIAL_CHARS;
    if($keep != ''){
        // drop the code points the caller wants to keep from the strip list
        $specials = array_diff($UTF8_SPECIAL_CHARS, utf8_to_unicode($keep));
    }else{
        $specials = $UTF8_SPECIAL_CHARS;
    }

    // build one character class out of all remaining special chars
    $specials = unicode_to_utf8($specials);
    $specials = preg_quote($specials, '/');

    return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not defined).
 * The fallback implementation returns false for an empty needle and counts
 * a negative offset from the end of the haystack.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @author Andreas Gohr <andi@splitbrain.org>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    strpos()
 * @param  string $haystack UTF-8 encoded string to search in
 * @param  string $needle   UTF-8 encoded string to search for
 * @param  int    $offset   character position to start searching at
 * @return int|false character position of the first match, false if not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
    if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
        return mb_strpos($haystack,$needle,$offset,'utf-8');

    $hay    = utf8_to_unicode($haystack);
    $ndl    = utf8_to_unicode($needle);
    $hayLen = count($hay);
    $ndlLen = count($ndl);

    if($ndlLen == 0) return false;
    if($offset < 0) $offset = max(0, $hayLen + $offset);

    // last start position at which the needle still fits into the haystack
    $last = $hayLen - $ndlLen;
    for($pos = $offset; $pos <= $last; $pos++){
        for($i = 0; $i < $ndlLen; $i++){
            if($hay[$pos + $i] != $ndl[$i]) continue 2;
        }
        return $pos;
    }
    return false;
}

/**
 * This function will take any UTF-8 encoded text and return it as
 * a list of Unicode values
 *
 * Two, three and four byte sequences are decoded. A sequence that is still
 * incomplete at the end of the string is dropped.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded string
 * @return array  list of integer code points
 */
function utf8_to_unicode( $str ) {
    $unicode    = array();
    $values     = array();
    $lookingFor = 1;
    $len        = strlen( $str );

    for ($i = 0; $i < $len; $i++ ) {
        $thisValue = ord( $str[ $i ] );
        if ( $thisValue < 128 ) {
            $unicode[] = $thisValue;
            continue;
        }

        if ( count( $values ) == 0 ) {
            // the lead byte tells how many bytes make up this character
            if     ( $thisValue < 224 ) $lookingFor = 2;
            elseif ( $thisValue < 240 ) $lookingFor = 3;
            else                        $lookingFor = 4;
        }
        $values[] = $thisValue;

        if ( count( $values ) == $lookingFor ) {
            if ( $lookingFor == 4 ) {
                $number = ( ( $values[0] % 8 )  * 262144 ) + ( ( $values[1] % 64 ) * 4096 )
                        + ( ( $values[2] % 64 ) * 64 )     +   ( $values[3] % 64 );
            } elseif ( $lookingFor == 3 ) {
                $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
            } else {
                $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
            }
            $unicode[]  = $number;
            $values     = array();
            $lookingFor = 1;
        }
    }
    return $unicode;
}

/**
 * This function will convert a Unicode array back to its UTF-8 representation
 *
 * Code points up to U+FFFF produce one to three bytes, everything above
 * is written as a four byte sequence.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points
 * @return string UTF-8 encoded string
 */
function unicode_to_utf8( $str ) {
    $utf8 = '';
    foreach( $str as $unicode ) {
        if ( $unicode < 128 ) {
            $utf8.= chr( $unicode );
        } elseif ( $unicode < 2048 ) {
            $utf8.= chr( 192 + ( $unicode >> 6 ) );
            $utf8.= chr( 128 + ( $unicode & 63 ) );
        } elseif ( $unicode < 65536 ) {
            $utf8.= chr( 224 + ( $unicode >> 12 ) );
            $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
            $utf8.= chr( 128 + ( $unicode & 63 ) );
        } else {
            $utf8.= chr( 240 + ( $unicode >> 18 ) );
            $utf8.= chr( 128 + ( ( $unicode >> 12 ) & 63 ) );
            $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
            $utf8.= chr( 128 + ( $unicode & 63 ) );
        }
    }
    return $utf8;
}

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of lower case letters to the code
 * points of their corresponding upper case letters.
 *
 * It is declared global so the functions above can reach it even when this
 * file is included from within a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
global $UTF8_LOWER_TO_UPPER;
$UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of upper case letters to the code
 * points of their corresponding lower case letters (it does so by flipping
 * $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
global $UTF8_UPPER_TO_LOWER;
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are lower case letters only.
 *
 * It is declared global so utf8_deaccent() can reach it even when this
 * file is included from within a function.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
    'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
    'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
    'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
    'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
    'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
    'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
    'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
    'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
    'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
    'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
    'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
    'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
    'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
    'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
    'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines replacements for accented characters from the ASCII-7
 * range. These are upper case letters only: every key is the upper case
 * form of the matching entry in $UTF8_LOWER_ACCENTS, so that deaccenting
 * "upper case only" never touches lower case text.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
    'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
    'ẞ' => 'Ss', 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
    'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
    'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
    'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
    'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
    'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
    'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
    'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
    'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
    'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
    'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
    'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
    'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
    'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 *
 * The entries are Unicode code points, not UTF-8 byte sequences.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
    0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
    0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
    0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0142, 0x007b, 0x007c, 0x007d, 0x007e,
    0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
    0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
    0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
    0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
    0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
    0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
    0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
    0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
    0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
    0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
    0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
    0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
    0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
    0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
    0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
    0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
    0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
    0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
    0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
    0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
    0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
    0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
    0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
    0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
    0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
    0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
    0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
    0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
    0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
    0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
    0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
    0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
    0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
    0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
    0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
    0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
    0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
    0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
    0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
    0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
    0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
    0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
    0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
    0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
    0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
    0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);