xref: /dokuwiki/inc/utf8.php (revision f62ea8a1d1cf10eddeae777b11420624e111b7ea)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are kept as they are and never encoded.
 *
 * In safe mode (the default) a name made up only of ASCII letters,
 * digits and the characters / _ - . % is returned untouched. This
 * makes it harmless to run the function several times on the same
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when only URL-safe characters are present
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  $alreadySafe = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$alreadySafe){
    // encode everything, then restore the path separators
    $file = str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
30
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
43
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to check
 * @return bool   true if no byte above 127 is present (also for an empty string)
 */
function utf8_isASCII($str){
  // A single byte-wise match replaces the old per-character loop, which
  // used the $str{$i} offset syntax that no longer parses on PHP 8.
  // No /u modifier: the pattern has to look at raw bytes.
  return !preg_match('/[\x80-\xFF]/',$str);
}
55
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 127 is removed,
 * all other bytes are kept in their original order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to strip
 * @return string the input without any byte >= 0x80
 */
function utf8_strip($str){
  // Byte-wise removal (no /u modifier). Replaces the old loop, which used
  // the $str{$i} offset syntax that no longer parses on PHP 8.
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
72
/**
 * Tries to detect if a string is UTF-8 encoded
 *
 * Only the byte structure is verified: every lead byte has to be followed
 * by the matching number of continuation bytes (10bbbbbb). Lead bytes for
 * five and six byte sequences are accepted as well, and the decoded values
 * themselves are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool   true if the byte structure is well formed
 */
function utf8_check($Str) {
  $len = strlen($Str);
  for ($i=0; $i<$len; $i++) {
    $byte = ord($Str[$i]);
    if ($byte < 0x80) continue;               // 0bbbbbbb

    if     (($byte & 0xE0) == 0xC0) $n = 1;   // 110bbbbb
    elseif (($byte & 0xF0) == 0xE0) $n = 2;   // 1110bbbb
    elseif (($byte & 0xF8) == 0xF0) $n = 3;   // 11110bbb
    elseif (($byte & 0xFC) == 0xF8) $n = 4;   // 111110bb
    elseif (($byte & 0xFE) == 0xFC) $n = 5;   // 1111110b
    else return false;                        // does not match any model

    // $n bytes matching 10bbbbbb have to follow
    while ($n-- > 0) {
      $i++;
      if ($i == $len) return false;
      if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
    }
  }
  return true;
}
95
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string the UTF-8 string to measure
 * @return int    number of characters (not bytes)
 */
function utf8_strlen($string){
  // the body used to read an undefined $str, so the result was always 0
  return strlen(utf8_decode($string));
}
110
/**
 * Unicode aware replacement for substr()
 *
 * The string is split into its UTF-8 characters and the requested slice
 * is glued back together, so $start and $length count characters, not
 * bytes. Negative values work the way they do in array_slice().
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str    the UTF-8 string
 * @param  int      $start  index of the first character to return
 * @param  int|null $length number of characters, everything up to the end if omitted
 * @return string
 */
function utf8_substr($str,$start,$length=null){
  // the 's' modifier makes the dot match newlines as well - without it
  // every line break would silently vanish from the result
  preg_match_all('/./us', $str, $ar);

  // A numeric check instead of the former "!= null" comparison, so an
  // explicit length of 0 yields an empty string rather than the whole rest
  if(is_numeric($length)) {
    return join('',array_slice($ar[0],$start,(int) $length));
  }
  return join('',array_slice($ar[0],$start));
}
127
/**
 * Unicode aware replacement for explode()
 *
 * The separator is taken literally (it is quoted before being used as
 * a pattern). An empty separator raises an E_USER_WARNING and makes the
 * function return FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator to split at
 * @param  string $str the UTF-8 string to split
 * @return array|false list of pieces, FALSE on an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
143
/**
 * Unicode aware replacement for str_replace()
 *
 * Search and replacement strings are both taken literally, exactly like
 * str_replace() does. Either of them may be an array.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   the string(s) to search for
 * @param  string|array $r   the replacement(s)
 * @param  string|array $str the UTF-8 subject
 * @return string|array the subject with all replacements applied
 */
function utf8_str_replace($s,$r,$str){
  // turn every search string into a literal pattern
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // preg_replace() expands $n and \n references in the replacement;
  // escape them so the text is inserted verbatim
  if(!is_array($r)){
    $r = addcslashes($r,'\\$');
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }

  return preg_replace($s,$r,$str);
}
161
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed straight on to ltrim(). With a
 * charlist every character in it (multibyte ones included) is stripped
 * from the beginning of the string. Ranges ("a..z") are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist the characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // Quote the charlist for use in a character class. preg_quote() also
  // covers '^', which the former hand-written escaping missed - a leading
  // '^' negated or broke the class.
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
177
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed straight on to rtrim(). With a
 * charlist every character in it (multibyte ones included) is stripped
 * from the end of the string. Ranges ("a..z") are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist the characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // Quote the charlist for use in a character class. preg_quote() also
  // covers '^', which the former hand-written escaping missed.
  $charlist = preg_quote($charlist,'/');

  // The D modifier pins '$' to the very end of the subject. Without it
  // the pattern also matches in front of a trailing newline and strips
  // characters that are not at the end.
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
193
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed straight on to trim(). With a
 * charlist the work is done by utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist the characters to strip from both ends
 * @return string
 */
function utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // The charlist has to be handed on to both helpers. It used to be
  // dropped, so only whitespace was ever trimmed.
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
206
207
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension if available (and not disabled by defining
 * UTF8_NOMBSTRING). Otherwise the string is converted through the
 * $UTF8_UPPER_TO_LOWER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string the UTF-8 string to convert
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  foreach ($uni as $i => $code){
    // isset() avoids an undefined-key warning for every character that
    // has no entry in the table
    if(isset($UTF8_UPPER_TO_LOWER[$code])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$code];
    }
  }
  return unicode_to_utf8($uni);
}
231
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension if available (and not disabled by defining
 * UTF8_NOMBSTRING). Otherwise the string is converted through the
 * $UTF8_LOWER_TO_UPPER lookup table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string the UTF-8 string to convert
 * @return string
 */
function utf8_strtoupper($string){
  // this branch used to call mb_strtolower() and so returned lower case
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  foreach ($uni as $i => $code){
    // isset() avoids an undefined-key warning for every character that
    // has no entry in the table
    if(isset($UTF8_LOWER_TO_UPPER[$code])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$code];
    }
  }
  return unicode_to_utf8($uni);
}
255
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional parameter restricts the replacement to lower case
 * ($case = -1) or upper case ($case = 1) letters. By default ($case = 0)
 * both tables are applied, the lower case one first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string the UTF-8 string to deaccent
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $from   = array_keys($UTF8_LOWER_ACCENTS);
    $to     = array_values($UTF8_LOWER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $from   = array_keys($UTF8_UPPER_ACCENTS);
    $to     = array_values($UTF8_UPPER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }
  return $string;
}
275
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * On top of the characters listed in $UTF8_SPECIAL_CHARS the bytes
 * 0x00 to 0x19 are stripped as well (they are not part of that array).
 * The pattern is built on the first call and then reused.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @return string
 */
function utf8_stripspecials($string,$repl=''){
  global $UTF8_SPECIAL_CHARS;

  // complete pattern, cached across calls
  static $pattern = null;
  if($pattern === null){
    $quoted  = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
    $pattern = '/[\x00-\x19'.$quoted.']/u';
  }

  return preg_replace($pattern,$repl,$string);
}
296
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses the mbstring extension if available (and not disabled by defining
 * UTF8_NOMBSTRING). The fallback splits the haystack at the needle and
 * measures the first piece.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack the UTF-8 string to search in
 * @param  string $needle   the string to search for
 * @param  int    $offset   character position to start searching at
 * @return int|false character position of the first match, false if there is none
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    // the fallback used to read an undefined $str instead of $haystack
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() returns FALSE on an empty needle - do not count() that
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search in the remainder only and shift the result back
  $haystack = utf8_substr($haystack, $offset);
  $pos = utf8_strpos($haystack,$needle);
  if ( false !== $pos ){
     return $pos + $offset;
  }
  return false;
}
329
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values (code points)
 *
 * Sequences of one to four bytes are decoded. The input is not
 * validated: the lead byte alone decides how many bytes form a
 * character, and an incomplete sequence at the end of the string
 * is dropped.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str the UTF-8 encoded string
 * @return array  list of integer code points
 */
function utf8_to_unicode( $str ) {
  $unicode = array();
  $values = array();   // bytes collected for the current multibyte character
  $lookingFor = 1;     // number of bytes the current character consists of
  $len = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // the first high byte of a character tells how long the sequence is;
    // lead bytes from 0xF0 on start a four byte sequence (used to be
    // treated as three bytes, which garbled everything after it)
    if ( count( $values ) == 0 ) {
      if     ( $thisValue < 224 ) $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else                        $lookingFor = 4;
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 ) +
                  ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
      } else {
        $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
      }
      $unicode[] = $number;
      $values = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}
361
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Code points are encoded as one to four bytes depending on their value.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points
 * @return string the UTF-8 encoded string
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 128 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 2048 ) {
      $utf8.= chr( 192 | ( $unicode >> 6 ) );
      $utf8.= chr( 128 | ( $unicode & 63 ) );
    } elseif ( $unicode < 65536 ) {
      $utf8.= chr( 224 | ( $unicode >> 12 ) );
      $utf8.= chr( 128 | ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 | ( $unicode & 63 ) );
    } else {
      // anything above U+FFFF needs four bytes; it used to be squeezed
      // into three, which produced invalid UTF-8
      $utf8.= chr( 240 | ( $unicode >> 18 ) );
      $utf8.= chr( 128 | ( ( $unicode >> 12 ) & 63 ) );
      $utf8.= chr( 128 | ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 | ( $unicode & 63 ) );
    }
  }
  return $utf8;
}
385
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of a lower case letter
 * (key) to the code point of its corresponding upper case letter (value).
 * It is used by utf8_strtoupper() when mbstring is not used and is flipped
 * to build $UTF8_UPPER_TO_LOWER.
 *
 * A few keys (0x00FE, 0x00F0, 0x03BB, 0x0457) are listed more than once,
 * always with the same value, so the repetition has no effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);
439
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of an upper case letter
 * (key) to the code point of its corresponding lower case letter (value).
 * It is built by flipping $UTF8_LOWER_TO_UPPER and is used by
 * utf8_strtolower() when mbstring is not used.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
449
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented lower case letters (keys) to their
 * replacements from the ASCII-7 range (values). A replacement may be
 * longer than one character (e.g. 'ae' or 'ss').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
476
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented upper case letters (keys) to their
 * replacements from the ASCII-7 range (values). A replacement may be
 * longer than one character (e.g. 'Ae').
 *
 * The keys have to be the upper case forms. They used to be lower case
 * letters, which made utf8_deaccent() with $case = 1 rewrite lower case
 * input and leave real upper case letters untouched.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'ẞ' => 'Ss', 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
503
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * The entries are Unicode code points. utf8_stripspecials() converts the
 * list to UTF-8 and uses it inside a character class.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array
 * (utf8_stripspecials() adds them itself). The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d
 * (the gaps in the first rows below mark where they were left out)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
	0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
	0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
	0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
	0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
	0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
	0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
	0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
	0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
	0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
	0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
	0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
	0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
	0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
	0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
	0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
	0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
	0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
	0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
	0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
	0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
	0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
	0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
	0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
	0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
	0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
	0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
	0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
	0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
	0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
	0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
	0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
	0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
	0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
	0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
	0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
	0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
	0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
	0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
	0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
	0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
	0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
	0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
	0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
	0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
570
571
572//Setup VIM: ex: et ts=2 enc=utf-8 :
573