xref: /dokuwiki/inc/utf8.php (revision 3a8a9050bc97ca54a26f0163fb9b8d50c82f8b3d)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are left unencoded.
 *
 * When $safe is true (the default) a string consisting solely of ASCII
 * letters, digits and the characters / _ - . % is returned untouched.
 * This makes it safe to run the function several times on the same
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 *
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when only URL-safe characters are present
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  $alreadySafe = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if($alreadySafe) return $file;

  // encode everything, then restore the path separators
  return str_replace('%2F','/',urlencode($file));
}
30
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 *
 * @param  string $file the URL-encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
43
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $str the string to inspect
 * @return bool   true when no byte of $str is above 0x7F (also for '')
 */
function utf8_isASCII($str){
  // A byte-wise (non-/u) match on the high-bit range replaces the former
  // per-character loop, which relied on the $str{$i} offset syntax that
  // newer PHP versions no longer accept.
  return !preg_match('/[\x80-\xFF]/',$str);
}
55
/**
 * Checks whether a string has the byte structure of UTF-8
 *
 * Every lead byte must be followed by the number of continuation bytes
 * (10bbbbbb) its bit pattern announces. Lead bytes announcing up to five
 * continuation bytes are accepted. Only the structure is checked; the
 * decoded values are not examined.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 *
 * @param  string $Str the string to check
 * @return bool   true if the whole string matches the pattern
 */
function utf8_check($Str) {
 $len = strlen($Str);
 for ($i=0; $i<$len; $i++) {
  $byte = ord($Str[$i]);
  if ($byte < 0x80) continue;                # 0bbbbbbb
  if     (($byte & 0xE0) == 0xC0) $n=1;      # 110bbbbb
  elseif (($byte & 0xF0) == 0xE0) $n=2;      # 1110bbbb
  elseif (($byte & 0xF8) == 0xF0) $n=3;      # 11110bbb
  elseif (($byte & 0xFC) == 0xF8) $n=4;      # 111110bb
  elseif (($byte & 0xFE) == 0xFC) $n=5;      # 1111110b
  else return false;                         # Does not match any model

  # the next n bytes must exist and match 10bbbbbb
  while ($n-- > 0) {
   $i++;
   if ($i == $len) return false;
   if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
  }
 }
 return true;
}
78
/**
 * Unicode aware replacement for strlen()
 *
 * Delegates to mb_strlen() when the mbstring extension is available and
 * UTF8_NOMBSTRING is not defined; otherwise counts the code points
 * returned by utf8_to_unicode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strlen()
 *
 * @param  string $string a UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
  $useMb = !defined('UTF8_NOMBSTRING') && function_exists('mb_strlen');
  if($useMb){
    return mb_strlen($string,'utf-8');
  }

  return count(utf8_to_unicode($string));
}
94
/**
 * Unicode aware replacement for substr()
 *
 * Delegates to mb_substr() when the mbstring extension is available and
 * UTF8_NOMBSTRING is not defined; otherwise slices the code point list
 * produced by utf8_to_unicode() and re-encodes it.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr()
 *
 * @param  string   $str    a UTF-8 encoded string
 * @param  int      $start  character offset to start at
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string   the requested part of $str
 */
function utf8_substr($str, $start, $length=null){
  $useMb = !defined('UTF8_NOMBSTRING') && function_exists('mb_substr');
  if($useMb){
    return mb_substr($str,$start,$length,'utf-8');
  }

  $chars = array_slice(utf8_to_unicode($str),$start,$length);
  return unicode_to_utf8($chars);
}
110
/**
 * Unicode aware replacement for strtolower()
 *
 * Delegates to mb_strtolower() when the mbstring extension is available
 * and UTF8_NOMBSTRING is not defined; otherwise maps every code point
 * through the global $UTF8_UPPER_TO_LOWER table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 *
 * @param  string $string a UTF-8 encoded string
 * @return string the lower-cased string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an undefined-index notice for every character that
    // has no entry in the table (digits, punctuation, lower case, ...)
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
133
/**
 * Unicode aware replacement for strtoupper()
 *
 * Delegates to mb_strtoupper() when the mbstring extension is available
 * and UTF8_NOMBSTRING is not defined; otherwise maps every code point
 * through the global $UTF8_LOWER_TO_UPPER table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 *
 * @param  string $string a UTF-8 encoded string
 * @return string the upper-cased string
 */
function utf8_strtoupper($string){
  // must use the *upper* variant here; calling mb_strtolower() would
  // return the lower-cased string instead
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an undefined-index notice for unmapped characters
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
156
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from the global lookup tables
 * $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $string the UTF-8 string to deaccent
 * @param  int    $case   -1 = lower case table only, 1 = upper case table
 *                        only, 0 (default) = lower first, then upper
 * @return string the string with the table entries replaced
 */
function utf8_deaccent($string,$case=0){
  // collect the tables to apply, lower case always first
  $tables = array();
  if($case <= 0) $tables[] = 'UTF8_LOWER_ACCENTS';
  if($case >= 0) $tables[] = 'UTF8_UPPER_ACCENTS';

  foreach($tables as $name){
    $map    = $GLOBALS[$name];
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
176
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS list
 * minus the ones given in $keep. The byte range 0x00 to 0x19 is always
 * stripped in addition (it is not part of $UTF8_SPECIAL_CHARS).
 *
 * Note from the original author: be sure to specify all special chars
 * you give in $repl in $keep, too, or it won't work.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string the cleaned string as returned by preg_replace()
 */
function utf8_stripspecials($string,$repl='',$keep=''){
  global $UTF8_SPECIAL_CHARS;

  $strip = $UTF8_SPECIAL_CHARS;
  if($keep != ''){
    // drop the code points the caller wants to preserve
    $strip = array_diff($strip, utf8_to_unicode($keep));
  }

  // build the body of a character class from the remaining code points
  $class = preg_quote(unicode_to_utf8($strip), '/');

  return preg_replace('/[\x00-\x19'.$class.']/u',$repl,$string);
}
204
/**
 * Unicode aware replacement for strpos()
 *
 * Delegates to mb_strpos() when the mbstring extension is available and
 * UTF8_NOMBSTRING is not defined. Otherwise both strings are converted
 * to code point lists with utf8_to_unicode() and compared position by
 * position.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @author Andreas Gohr <andi@splitbrain.org>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    strpos()
 *
 * @param  string $haystack the UTF-8 string to search in
 * @param  string $needle   the UTF-8 string to search for
 * @param  int    $offset   character position to start searching at
 * @return int|false character position of the first match, false if none
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  $hay  = utf8_to_unicode($haystack);
  $ndl  = utf8_to_unicode($needle);
  $hlen = count($hay);
  $nlen = count($ndl);

  // nothing to look for: report "not found" instead of reading $ndl[0]
  if($nlen == 0) return false;

  // the fallback never looked at anything before position 0
  $start = ($offset < 0) ? 0 : $offset;

  // stop as soon as the needle no longer fits into the remainder; this
  // keeps all reads inside the haystack and prevents a needle ending in
  // a NUL character from matching past the end
  for($pos = $start; $pos + $nlen <= $hlen; $pos++){
    for($i = 0; $i < $nlen; $i++){
      if($ndl[$i] != $hay[$pos + $i]) break;
    }
    if($i == $nlen) return $pos;
  }
  return false;
}
238
/**
 * This function will take any UTF-8 encoded text and return it as
 * a list of Unicode values (code points)
 *
 * One, two, three and four byte sequences are decoded. The input is not
 * validated: any byte >= 0x80 seen while no sequence is open is treated
 * as a lead byte, and an incomplete sequence at the end of the string is
 * silently dropped.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 *
 * @param  string $str UTF-8 encoded text
 * @return array  list of integer code points
 */
function utf8_to_unicode( $str ) {
  $unicode = array();
  $values = array();   // bytes of the multi byte sequence being collected
  $lookingFor = 1;     // total number of bytes the open sequence needs
  $len = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );

    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    if ( count( $values ) == 0 ) {
      // lead byte: its range tells how long the sequence is
      if ( $thisValue < 224 )     $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else                        $lookingFor = 4;
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 )
                + ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 )
                + ( $values[2] % 64 );
      } else {
        $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
      }
      $unicode[] = $number;
      $values = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}
270
/**
 * This function will convert a list of Unicode values (code points) back
 * to its UTF-8 representation
 *
 * Values below 0x80 become one byte, below 0x800 two bytes, below
 * 0x10000 three bytes and everything above four bytes.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 *
 * @param  array  $str list of integer code points
 * @return string the UTF-8 encoded text
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 0x80 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 0x800 ) {
      $utf8.= chr( 0xC0 | ( $unicode >> 6 ) );
      $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
    } elseif ( $unicode < 0x10000 ) {
      $utf8.= chr( 0xE0 | ( $unicode >> 12 ) );
      $utf8.= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
      $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
    } else {
      // four byte form; the three byte branch cannot hold these values
      $utf8.= chr( 0xF0 | ( $unicode >> 18 ) );
      $utf8.= chr( 0x80 | ( ( $unicode >> 12 ) & 0x3F ) );
      $utf8.= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
      $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
    }
  }
  return $utf8;
}
294
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the code point of a lower case letter to the
 * code point of its corresponding upper case letter. Each key is listed
 * exactly once.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x0123=>0x0122,
);

/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the code point of an upper case letter to the
 * code point of its corresponding lower case letter (it does so by
 * flipping $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
358
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps lower case accented (and a few other special) letters to a
 * replacement made of ASCII-7 letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
    'à' => 'a',  'ô' => 'o',  'ď' => 'd',  'ḟ' => 'f',  'ë' => 'e',  'š' => 's',  'ơ' => 'o',
    'ß' => 'ss', 'ă' => 'a',  'ř' => 'r',  'ț' => 't',  'ň' => 'n',  'ā' => 'a',  'ķ' => 'k',
    'ŝ' => 's',  'ỳ' => 'y',  'ņ' => 'n',  'ĺ' => 'l',  'ħ' => 'h',  'ṗ' => 'p',  'ó' => 'o',
    'ú' => 'u',  'ě' => 'e',  'é' => 'e',  'ç' => 'c',  'ẁ' => 'w',  'ċ' => 'c',  'õ' => 'o',
    'ṡ' => 's',  'ø' => 'o',  'ģ' => 'g',  'ŧ' => 't',  'ș' => 's',  'ė' => 'e',  'ĉ' => 'c',
    'ś' => 's',  'î' => 'i',  'ű' => 'u',  'ć' => 'c',  'ę' => 'e',  'ŵ' => 'w',  'ṫ' => 't',
    'ū' => 'u',  'č' => 'c',  'ö' => 'oe', 'è' => 'e',  'ŷ' => 'y',  'ą' => 'a',  'ł' => 'l',
    'ų' => 'u',  'ů' => 'u',  'ş' => 's',  'ğ' => 'g',  'ļ' => 'l',  'ƒ' => 'f',  'ž' => 'z',
    'ẃ' => 'w',  'ḃ' => 'b',  'å' => 'a',  'ì' => 'i',  'ï' => 'i',  'ḋ' => 'd',  'ť' => 't',
    'ŗ' => 'r',  'ä' => 'ae', 'í' => 'i',  'ŕ' => 'r',  'ê' => 'e',  'ü' => 'ue', 'ò' => 'o',
    'ē' => 'e',  'ñ' => 'n',  'ń' => 'n',  'ĥ' => 'h',  'ĝ' => 'g',  'đ' => 'd',  'ĵ' => 'j',
    'ÿ' => 'y',  'ũ' => 'u',  'ŭ' => 'u',  'ư' => 'u',  'ţ' => 't',  'ý' => 'y',  'ő' => 'o',
    'â' => 'a',  'ľ' => 'l',  'ẅ' => 'w',  'ż' => 'z',  'ī' => 'i',  'ã' => 'a',  'ġ' => 'g',
    'ṁ' => 'm',  'ō' => 'o',  'ĩ' => 'i',  'ù' => 'u',  'į' => 'i',  'ź' => 'z',  'á' => 'a',
    'û' => 'u',  'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
385
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps upper case accented (and a few other special) letters to a
 * replacement made of ASCII-7 letters only. All keys are upper case
 * letters, so the table never touches lower case input.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'ẞ' => 'Ss', 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
412
/**
 * UTF-8 array of common special characters
 *
 * This list of code points should contain all special characters (not a
 * letter or digit) defined in the various local charsets - it is not a
 * complete list of non-alphanumeric characters in UTF-8. It is not
 * perfect but should match most cases of special chars.
 *
 * The code points 0x00 to 0x19 are _not_ included in this array. The
 * space 0x20 is!
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
  0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0142, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
478?>
479