xref: /dokuwiki/inc/utf8.php (revision 42905504e134d999710eacf73253844e85cf6fec)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are never encoded.
 *
 * With $safe enabled (the default) a name made up solely of ASCII
 * letters, digits and the characters / _ - . % is returned untouched.
 * An already encoded name passes that check, which makes it safe to
 * run this function several times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when the name looks plain/encoded already
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  if(!$safe || !preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
    $encoded = urlencode($file);
    // restore the path separators urlencode() escaped
    return str_replace('%2F','/',$encoded);
  }
  return $file;
}
30
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
43
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return bool   true when no byte is above 127 (also for the empty string)
 */
function utf8_isASCII($str){
  $len = strlen($str);
  for($i=0; $i<$len; $i++){
    // square brackets: the {} offset syntax no longer parses on PHP 8
    if(ord($str[$i]) > 127) return false;
  }
  return true;
}
55
/**
 * Strips all highbyte chars
 *
 * Every byte above 127 is dropped, so the result is a pure ASCII7 string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input without any byte >= 128
 */
function utf8_strip($str){
  $ascii = '';
  $len   = strlen($str);
  for($i=0; $i<$len; $i++){
    // square brackets: the {} offset syntax no longer parses on PHP 8
    $byte = $str[$i];
    if(ord($byte) < 128){
      $ascii .= $byte;
    }
  }
  return $ascii;
}
72
/**
 * Tries to detect if a string is in Unicode (UTF-8) encoding
 *
 * Walks the string byte by byte: a lead byte announces how many
 * continuation bytes (10bbbbbb) have to follow. Lead bytes for
 * sequences of up to six bytes are accepted. The check is purely
 * structural.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool   false as soon as a byte does not fit the scheme
 */
function utf8_check($Str) {
  // lead byte mask => expected marker, ordered by sequence length (2..6 bytes)
  $leads = array(
    0xE0 => 0xC0, # 110bbbbb
    0xF0 => 0xE0, # 1110bbbb
    0xF8 => 0xF0, # 11110bbb
    0xFC => 0xF8, # 111110bb
    0xFE => 0xFC, # 1111110b
  );
  $len = strlen($Str);

  for ($i = 0; $i < $len; $i++) {
    $byte = ord($Str[$i]);
    if ($byte < 0x80) continue; # 0bbbbbbb

    $need   = 0;
    $follow = 0;
    foreach ($leads as $mask => $marker) {
      $follow++;
      if (($byte & $mask) == $marker) {
        $need = $follow;
        break;
      }
    }
    if ($need == 0) return false; # Does not match any model

    # exactly $need bytes matching 10bbbbbb have to follow
    while ($need-- > 0) {
      $i++;
      if ($i == $len) return false;
      if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
    }
  }
  return true;
}
95
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() turns every character into a single byte (characters
 * outside of ISO-8859-1 become '?'), so the byte length of its result
 * equals the number of characters.
 *
 * NOTE(review): utf8_decode() is deprecated in recent PHP releases (8.2+);
 * confirm against the supported PHP versions before relying on it.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
110
/**
 * Unicode aware replacement for substr()
 *
 * The string is split into its characters which are then sliced with
 * array_slice(), so $start and $length count characters, not bytes.
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  character offset, negative counts from the end
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string   the requested part, '' when $str cannot be split
 */
function utf8_substr($str,$start,$length=null){
   // the s modifier lets the dot match newlines too - without it
   // every line break would silently vanish from the result
   if(!preg_match_all("/./su", $str, $ar)) {
       // empty input or invalid UTF-8: nothing to slice
       return '';
   }

   // strict check: a length of 0 is a real length, not "no length given"
   if($length !== null) {
       return join("",array_slice($ar[0],$start,$length));
   }
   return join("",array_slice($ar[0],$start));
}
127
/**
 * Unicode aware replacement for substr_replace()
 *
 * Builds the result from the part in front of $start, the replacement
 * and the part behind $start + $length. With the default $length of 0
 * the replacement is inserted at $start.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      UTF-8 encoded string
 * @param  string $replacement text to put in
 * @param  int    $start       character offset where the replacement starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start>0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
141
/**
 * Unicode aware replacement for explode()
 *
 * The separator is quoted and used as a UTF-8 regular expression for
 * preg_split(). An empty separator raises an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the string to split
 * @return array|false the parts, false for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
157
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as UTF-8 regular expressions
 * through preg_replace(). Replacements are inserted literally: the
 * characters preg_replace() would read as back-references ($n, \n)
 * are escaped first, matching what str_replace() does.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   the string(s) to search for
 * @param  string|array $r   the replacement(s)
 * @param  string|array $str the subject(s)
 * @return string|array
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // neutralise back-reference syntax so the replacement stays literal
  if(!is_array($r)){
    $r = addcslashes($r,'\\$');
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }

  return preg_replace($s,$r,$str);
}
175
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. Otherwise the charlist
 * is turned into a regular expression character class and every
 * leading character contained in it is removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip, '' for whitespace
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass - the caret has to be
  //quoted as well or a leading one would negate the whole class
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
191
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. Otherwise the charlist
 * is turned into a regular expression character class and every
 * trailing character contained in it is removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip, '' for whitespace
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass - the caret has to be
  //quoted as well or a leading one would negate the whole class
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
207
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used. Otherwise the work is
 * handed to utf8_rtrim() and utf8_ltrim() with the same charlist.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip, '' for whitespace
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be forwarded - without it both helpers
  // fall back to stripping whitespace
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
220
221
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mb_string extension if available (and not disabled through
 * the UTF8_NOMBSTRING constant). Otherwise the string is converted to
 * code points which are mapped through $UTF8_UPPER_TO_LOWER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() keeps code points without a mapping from raising
    // an undefined index notice/warning
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
245
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mb_string extension if available (and not disabled through
 * the UTF8_NOMBSTRING constant). Otherwise the string is converted to
 * code points which are mapped through $UTF8_LOWER_TO_UPPER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  // check for the function that is actually called below
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() keeps code points without a mapping from raising
    // an undefined index notice/warning
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
269
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional parameter restricts the replacement to lower case
 * ($case = -1) or upper case ($case = 1) letters. The default
 * ($case = 0) handles both, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

  // collect the lookup tables to apply, in the order they are applied
  $tables = array();
  if($case <= 0) $tables[] = $UTF8_LOWER_ACCENTS;
  if($case >= 0) $tables[] = $UTF8_UPPER_ACCENTS;

  foreach($tables as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
289
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters listed in $UTF8_SPECIAL_CHARS are combined with the
 * byte range 0x00 to 0x19 (not part of that array) and the caller's
 * additional characters into one character class. The quoted list of
 * specials is built once and cached for later calls.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  static $specials = null;

  if($specials === null){
    global $UTF8_SPECIAL_CHARS;
    $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
  }

  // $additional goes in unquoted - it is documented as char class content
  $class = '['.$additional.'\x00-\x19'.$specials.']';
  return preg_replace('/'.$class.'/u',$repl,$string);
}
311
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses the mb_string extension if available (and not disabled through
 * the UTF8_NOMBSTRING constant). The fallback splits the haystack at
 * the needle and measures the part in front of the first hit.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack UTF-8 encoded string to search in
 * @param  string $needle   UTF-8 encoded string to search for
 * @param  int    $offset   character offset to start the search at
 * @return int|false character position of the first hit, false if none
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() hands back false for an empty needle
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search the remainder only and shift the hit back by the offset
  $rest = utf8_substr($haystack, $offset);
  $pos  = utf8_strpos($rest,$needle);
  if ( false !== $pos ){
     return $pos + $offset;
  }
  return false;
}
344
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (lead byte 110xxxxx) are converted to numeric
 * entities. All other bytes - ASCII as well as longer sequences - are
 * copied to the result unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
  $ret  = '';
  $max  = strlen($str);
  $last = 0;  // index of the first byte not yet copied to $ret
  for ($i=0; $i<$max; $i++) {
    $c1 = ord($str[$i]);
    if (($c1 >> 5) != 6) continue;  // not a 110x xxxx lead byte
    if ($i+1 >= $max) break;        // truncated sequence, keep the byte as is

    $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
    $c2 = ord($str[++$i]);                 // the trailing byte
    // 5 payload bits of the lead byte followed by 6 of the trailing byte
    $ret .= '&#' . ((($c1 & 31) << 6) | ($c2 & 63)) . ';';
    $last = $i+1;
  }
  return $ret . substr($str, $last); // append the last batch of regular characters
}
371
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values
 *
 * Sequences of two, three and four bytes are decoded. The sequence
 * length is taken from the first byte; the input is not validated.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded string (passed by reference, not modified)
 * @return array  list of integer code points
 */
function utf8_to_unicode( &$str ) {
  $unicode    = array();
  $values     = array();
  $lookingFor = 1;
  $len        = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // the first byte of a sequence tells how many bytes belong to it
    if ( count( $values ) == 0 ) {
      if ( $thisValue < 224 )     $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else                        $lookingFor = 4;
    }
    $values[] = $thisValue;
    if ( count( $values ) != $lookingFor ) continue;

    // sequence complete: join the payload bits of all its bytes
    if ( $lookingFor == 4 ) {
      $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 ) +
                ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
    } elseif ( $lookingFor == 3 ) {
      $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
    } else {
      $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
    }
    $unicode[]  = $number;
    $values     = array();
    $lookingFor = 1;
  }
  return $unicode;
}
403
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Code points are written as one to four bytes depending on their value.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points (passed by reference, not modified)
 * @return string UTF-8 encoded string
 */
function unicode_to_utf8( &$str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 128 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 2048 ) {
      $utf8.= chr( 192 + ( $unicode >> 6 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } elseif ( $unicode < 65536 ) {
      $utf8.= chr( 224 + ( $unicode >> 12 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } else {
      // everything above the BMP needs four bytes
      $utf8.= chr( 240 + ( ( $unicode >> 18 ) & 7 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 12 ) & 63 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    }
  }
  return $utf8;
}
427
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() if available (and not disabled through the
 * UTF8_NOMBSTRING constant). The fallback packs each code point returned
 * by utf8_to_unicode() into a single big endian 16 bit unit, so it may
 * really be UCS-2 only.
 *
 * @param  string $str UTF-8 encoded string (passed by reference, not modified)
 * @param  bool   $bom prepend the byte order mark FE FF
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $prefix = '';
  if($bom) $prefix = "\xFE\xFF";

  $useMb = !defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding');
  if($useMb){
    return $prefix.mb_convert_encoding($str,'UTF-16BE','UTF-8');
  }

  $units = array();
  foreach(utf8_to_unicode($str) as $codepoint){
    $units[] = pack('n',$codepoint);
  }
  return $prefix.join('',$units);
}
444
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Every big endian 16 bit unit is read as one code point and handed to
 * unicode_to_utf8(), so this may really be UCS-2 only.
 *
 * @param  string $str UTF-16BE encoded string (passed by reference, not modified)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $codepoints = unpack('n*',$str);
  return unicode_to_utf8($codepoints);
}
454
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of a lower case letter
 * (key) to the code point of its corresponding upper case letter (value).
 * It is used by utf8_strtoupper() and flipped to build $UTF8_UPPER_TO_LOWER.
 *
 * A few keys (0x00FE, 0x00F0, 0x03BB, 0x0457) are listed more than once,
 * always with the same value, so the repetition has no effect.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);
508
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of an upper case letter
 * (key) to the code point of its corresponding lower case letter (value).
 * It is created by flipping $UTF8_LOWER_TO_UPPER and used by
 * utf8_strtolower().
 *
 * No error suppression here: the source table holds integers only, so
 * array_flip() has nothing to complain about - and a real problem
 * (e.g. a missing source table) should not be hidden.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
518
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented lower case characters (keys, as UTF-8
 * strings) to their replacements from the ASCII-7 range (values). Some
 * replacements consist of more than one letter (e.g. 'ß' => 'ss').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
545
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented upper case characters (keys, as UTF-8
 * strings) to their replacements from the ASCII-7 range (values). Some
 * replacements consist of more than one letter (e.g. 'Ä' => 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
572
/**
 * UTF-8 array of common special characters
 *
 * This array holds the Unicode code points of special characters (not a
 * letter or digit) defined in the various local charsets - it's not a
 * complete list of non-alphanumeric characters in UTF-8. It's not perfect
 * but should match most cases of special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array
 * (utf8_stripspecials() adds that range itself). The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d
 * The gaps in the first rows below mark where they were left out.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
	0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
	0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
	0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
	0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
	0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
	0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
	0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
	0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
	0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
	0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
	0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
	0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
	0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
	0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
	0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
	0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
	0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
	0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
	0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
	0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
	0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
	0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
	0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
	0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
	0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
	0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
	0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
	0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
	0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
	0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
	0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
	0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
	0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
	0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
	0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
	0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
	0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
	0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
	0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
	0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
	0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
	0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
	0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
	0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
639
640
641//Setup VIM: ex: et ts=2 enc=utf-8 :
642