xref: /dokuwiki/inc/utf8.php (revision 92b83b77a91e8d0d82845ec6783c3a181068071d)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are left unencoded so path separators survive.
 *
 * When $safe is true (the default) the name is returned untouched if it
 * consists solely of ASCII letters, digits and the characters / _ - . %
 * Because '%' is part of that set an already encoded name is not encoded
 * a second time, which makes repeated calls on the same string harmless.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename or path to encode
 * @param  bool   $safe skip encoding when the name needs none
 * @return string the encoded name
 */
function utf8_encodeFN($file,$safe=true){
  // D modifier: '$' must match at the very end of the subject. Without it
  // a name ending in a newline passes the check and is returned unencoded.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  return str_replace('%2F','/',urlencode($file));
}
30
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(), provided as the counterpart
 * of utf8_encodeFN().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
43
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to check
 * @return bool   true when no byte has its high bit set
 */
function utf8_isASCII($str){
  // A single byte-wise regex replaces the per-character loop; the old
  // $str{$i} offset syntax is no longer accepted by PHP 8.
  return !preg_match('/[\x80-\xFF]/',$str);
}
55
/**
 * Strips all highbyte chars
 *
 * Every byte with a value of 128 or above is removed, so the result is a
 * pure 7bit ASCII string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input without any high bytes
 */
function utf8_strip($str){
  // Byte-wise regex instead of a character loop; the old $str{$i} offset
  // syntax is no longer accepted by PHP 8.
  return (string) preg_replace('/[\x80-\xFF]/','',$str);
}
72
/**
 * Checks whether a string is structurally well formed UTF-8
 *
 * Every lead byte must be followed by the number of continuation bytes
 * (10bbbbbb) its bit pattern announces. Lead bytes announcing up to five
 * continuation bytes are accepted. Only this structure is checked.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool   false on the first malformed sequence, true otherwise
 */
function utf8_check($Str) {
  $len = strlen($Str);
  $pos = 0;
  while ($pos < $len) {
    $lead = ord($Str[$pos]);

    if ($lead < 0x80) {                  // 0bbbbbbb - plain ASCII
      $pos++;
      continue;
    }

    // number of continuation bytes announced by the lead byte
    if     (($lead & 0xE0) == 0xC0) $extra = 1; // 110bbbbb
    elseif (($lead & 0xF0) == 0xE0) $extra = 2; // 1110bbbb
    elseif (($lead & 0xF8) == 0xF0) $extra = 3; // 11110bbb
    elseif (($lead & 0xFC) == 0xF8) $extra = 4; // 111110bb
    elseif (($lead & 0xFE) == 0xFC) $extra = 5; // 1111110b
    else return false;                          // matches no model

    // each announced byte has to exist and look like 10bbbbbb
    for ($seen = 0; $seen < $extra; $seen++) {
      $pos++;
      if ($pos == $len) return false;
      if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
    }
    $pos++;
  }
  return true;
}
95
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; it is kept
 * here because it is the approach this function is documented to use.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters (not bytes)
 */
function utf8_strlen($string){
  // must use the parameter - the former code read an undefined $str and
  // therefore always returned 0
  return strlen(utf8_decode($string));
}
110
/**
 * Unicode aware replacement for substr()
 *
 * Positions and lengths are counted in characters, not bytes. Negative
 * values for $start and $length count from the end of the string, as
 * they do for substr().
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    substr()
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  index of the first character to return
 * @param  int|null $length maximum number of characters, null for all
 * @return string|false the substring; false when $start lies beyond the
 *                      end of the string or $str is not valid UTF-8
 */
function utf8_substr($str, $start, $length=null){
  // Split into single characters instead of building a '.{n}' pattern:
  // this avoids interpolating caller input into a regex and PCRE's
  // repetition limit of 65535.
  $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
  if ( $chars === false ) {
    return false; // invalid UTF-8
  }

  $start = (int) $start;
  if ( $start > count($chars) ) {
    return false;
  }

  if ( !is_null($length) ) {
    $length = (int) $length;
  }
  return implode('', array_slice($chars, $start, $length));
}
132
/**
 * Unicode aware replacement for explode()
 *
 * The separator is matched literally with a UTF-8 aware regular
 * expression. An empty separator triggers an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the string to split
 * @return array|false the pieces, false for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
148
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are matched literally through UTF-8 aware regular
 * expressions. Replacements are inserted literally as well, just as
 * str_replace() does.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   search string(s)
 * @param  string|array $r   replacement string(s)
 * @param  string|array $str subject(s)
 * @return string|array|null result of preg_replace(); null on a PCRE
 *                           error such as an invalid UTF-8 subject
 */
function utf8_str_replace($s,$r,$str){
  if(is_array($s)){
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }else{
    $s = '!'.preg_quote($s,'!').'!u';
  }

  // preg_replace() would expand "$1" or "\1" inside the replacement as
  // back references; escape them so the text is inserted as given
  $literal = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr((string) $v, $literal);
    }
  }else{
    $r = strtr((string) $r, $literal);
  }

  return preg_replace($s,$r,$str);
}
166
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed straight to ltrim(). With a
 * charlist every character in it (multibyte ones included) is stripped
 * from the beginning of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      the string to trim
 * @param  string $charlist characters to strip, '' for ltrim() defaults
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote the charlist for use in a character class; preg_quote() also
  // covers '^', which would otherwise negate the class when it comes first
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
182
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed straight to rtrim(). With a
 * charlist every character in it (multibyte ones included) is stripped
 * from the end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      the string to trim
 * @param  string $charlist characters to strip, '' for rtrim() defaults
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote the charlist for use in a character class; preg_quote() also
  // covers '^', which would otherwise negate the class when it comes first
  $charlist = preg_quote($charlist,'/');

  // D modifier: '$' only matches at the very end, so characters in front
  // of a trailing newline are left alone (as rtrim() would)
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
198
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed straight to trim(). With a
 * charlist the work is delegated to utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @see    utf8_ltrim()
 * @see    utf8_rtrim()
 * @param  string $str      the string to trim
 * @param  string $charlist characters to strip, '' for trim() defaults
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed on - without it both helpers fall back
  // to plain whitespace trimming and the given characters stay in place
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
211
212
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mb_string extension if available, unless the constant
 * UTF8_NOMBSTRING is defined. Otherwise the string is converted to
 * code points and mapped through $UTF8_UPPER_TO_LOWER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string the lower cased string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() keeps code points without a mapping from raising an
    // undefined index notice
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
236
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mb_string extension if available, unless the constant
 * UTF8_NOMBSTRING is defined. Otherwise the string is converted to
 * code points and mapped through $UTF8_LOWER_TO_UPPER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string the upper cased string
 */
function utf8_strtoupper($string){
  // must call mb_strtoupper - the former code called mb_strtolower here
  // and so lower cased the string whenever mb_string was available
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() keeps code points without a mapping from raising an
    // undefined index notice
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
260
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional $case parameter limits the replacement to lower case
 * letters ($case = -1) or upper case letters ($case = 1). The default
 * ($case = 0) handles both, lower case table first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1 lower only, 1 upper only, 0 both
 * @return string the deaccented string
 */
function utf8_deaccent($string,$case=0){
  global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

  // collect the lookup tables that apply, in order of application
  $tables = array();
  if($case <= 0) $tables[] = $UTF8_LOWER_ACCENTS;
  if($case >= 0) $tables[] = $UTF8_UPPER_ACCENTS;

  foreach($tables as $table){
    $string = str_replace(array_keys($table),array_values($table),$string);
  }
  return $string;
}
280
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Be sure to list every special char used in $repl in $keep as well,
 * or it won't work.
 *
 * Besides the characters from $UTF8_SPECIAL_CHARS the control chars
 * 0x00 to 0x19 are stripped, too (they are not part of that array).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string the string with all special chars replaced
 */
function utf8_stripspecials($string,$repl='',$keep=''){
  global $UTF8_SPECIAL_CHARS;

  // start from the full list and drop whatever the caller wants to keep
  $codes = $UTF8_SPECIAL_CHARS;
  if($keep != ''){
    $codes = array_diff($codes, utf8_to_unicode($keep));
  }

  // turn the code points into a quoted character class body
  $class = preg_quote(unicode_to_utf8($codes), '/');

  return preg_replace('/[\x00-\x19'.$class.']/u',$repl,$string);
}
308
/**
 * This is a Unicode aware replacement for strpos()
 *
 * Uses the mb_string extension if available, unless the constant
 * UTF8_NOMBSTRING is defined. The fallback splits the haystack at the
 * needle and counts the characters of the first piece.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack UTF-8 string to search in
 * @param  string $needle   UTF-8 string to search for
 * @param  int    $offset   character position to start searching at
 * @return int|false character position of the first match, false if
 *                   the needle was not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    // the former code passed an undefined $str here, so the fallback
    // never found anything
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() returns false for an empty needle
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search the remainder and shift the result back by the offset
  $rest = utf8_substr($haystack, $offset);
  if ( $rest === false ) {
    return false; // offset lies beyond the end of the haystack
  }

  if ( false !== ($pos = utf8_strpos($rest,$needle))){
     return $pos + $offset;
  }
  return false;
}
341
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values (code points)
 *
 * Sequences of one to four bytes are decoded. The input is not
 * validated: the lead byte alone decides how many bytes are collected.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded string
 * @return array  list of integer code points
 */
function utf8_to_unicode( $str ) {
  $unicode = array();
  $values = array();   // bytes of the multibyte sequence being collected
  $lookingFor = 1;     // total number of bytes the current sequence needs
  $len = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    if ( count( $values ) == 0 ) {
      // the first byte of a sequence announces its length
      if ( $thisValue < 224 ) {
        $lookingFor = 2;
      } elseif ( $thisValue < 240 ) {
        $lookingFor = 3;
      } else {
        // 4 byte sequences used to be cut off after three bytes
        $lookingFor = 4;
      }
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] & 0x07 ) << 18 ) | ( ( $values[1] & 0x3F ) << 12 )
                | ( ( $values[2] & 0x3F ) << 6 )  |   ( $values[3] & 0x3F );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] & 0x0F ) << 12 ) | ( ( $values[1] & 0x3F ) << 6 )
                |   ( $values[2] & 0x3F );
      } else {
        $number = ( ( $values[0] & 0x1F ) << 6 ) | ( $values[1] & 0x3F );
      }
      $unicode[] = $number;
      $values = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}
373
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Code points are encoded as one to four bytes. The values are not
 * checked for being valid Unicode code points.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points
 * @return string the UTF-8 encoded string
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 0x80 ) {
      $utf8 .= chr( $unicode );
    } elseif ( $unicode < 0x800 ) {
      $utf8 .= chr( 0xC0 | ( $unicode >> 6 ) );
      $utf8 .= chr( 0x80 | ( $unicode & 0x3F ) );
    } elseif ( $unicode < 0x10000 ) {
      $utf8 .= chr( 0xE0 | ( $unicode >> 12 ) );
      $utf8 .= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
      $utf8 .= chr( 0x80 | ( $unicode & 0x3F ) );
    } else {
      // values from 0x10000 up need four bytes - they used to be
      // squeezed into three, which produced invalid output
      $utf8 .= chr( 0xF0 | ( $unicode >> 18 ) );
      $utf8 .= chr( 0x80 | ( ( $unicode >> 12 ) & 0x3F ) );
      $utf8 .= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
      $utf8 .= chr( 0x80 | ( $unicode & 0x3F ) );
    }
  }
  return $utf8;
}
397
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of each lower case letter
 * to the code point of its corresponding upper case letter. Every key is
 * listed once only, as $UTF8_UPPER_TO_LOWER is built by flipping this
 * table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtoupper()
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x0123=>0x0122,
);
451
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of each upper case letter
 * to the code point of its corresponding lower case letter. It is built
 * by flipping $UTF8_LOWER_TO_UPPER, so both tables always agree.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtolower()
 */
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
461
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps lower case accented characters to their replacement from the
 * ASCII-7 range. Only lower case letters are listed here; the upper
 * case ones live in $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
488
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps upper case accented characters to their replacement from the
 * ASCII-7 range. Only upper case letters are listed here; the keys have
 * to be the upper case characters themselves, otherwise utf8_deaccent()
 * would rewrite lower case letters when asked for upper case only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
515
/**
 * UTF-8 array of common special characters
 *
 * Lists the Unicode code points of special characters (neither letter
 * nor digit) defined in the various local charsets. It is not a complete
 * list of non-alphanumeric characters in UTF-8 - it's not perfect but
 * should match most cases of special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array.
 * The space 0x20 is!
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
  0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
581
582
583//Setup VIM: ex: et ts=2 enc=utf-8 :
584