xref: /dokuwiki/inc/utf8.php (revision 2cd2db38e844b55a35cbbbf800835d6b84531308)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
582257610Sandi * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are left unencoded.
 *
 * When $safe is true (the default) a string made up only of ASCII
 * letters, digits and the characters / _ - . % is returned untouched.
 * This makes it safe to run the function several times on the same
 * string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when only safe characters are present
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $looksEncoded = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$looksEncoded){
    // encode everything, then restore the path separators
    $file = str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
3049c713a3Sandi
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(); the counterpart of utf8_encodeFN().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
4349c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return bool   true when no byte has its high bit set (also for '')
 */
function utf8_isASCII($str){
  // byte-wise match (no /u): any byte in 0x80-0xFF means non-ASCII
  return !preg_match('/[\x80-\xFF]/', $str);
}
5544f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Every byte with a value above 127 is dropped, so multibyte UTF-8
 * characters vanish completely. Returns a pure ASCII7 string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string
 */
function utf8_strip($str){
  // byte-wise replace (no /u) so each high byte is removed individually
  return preg_replace('/[\x80-\xFF]/', '', $str);
}
72e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the bytes and verifies that every lead byte is followed by the
 * number of 10bbbbbb continuation bytes its bit pattern announces.
 * Lead bytes announcing up to five continuation bytes are accepted;
 * the decoded values themselves are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool
 */
function utf8_check($Str) {
  $len = strlen($Str);
  for ($i = 0; $i < $len; $i++) {
    $byte = ord($Str[$i]);
    if ($byte < 0x80) continue;                 # 0bbbbbbb
    if     (($byte & 0xE0) == 0xC0) $n = 1;     # 110bbbbb
    elseif (($byte & 0xF0) == 0xE0) $n = 2;     # 1110bbbb
    elseif (($byte & 0xF8) == 0xF0) $n = 3;     # 11110bbb
    elseif (($byte & 0xFC) == 0xF8) $n = 4;     # 111110bb
    elseif (($byte & 0xFE) == 0xFC) $n = 5;     # 1111110b
    else return false;                          # Does not match any model

    # the next n bytes must all look like 10bbbbbb
    while ($n-- > 0) {
      $i++;
      if ($i == $len) return false;
      if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
    }
  }
  return true;
}
9549c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string a UTF-8 encoded string
 * @return int    number of characters (not bytes)
 */
function utf8_strlen($string){
  // every UTF-8 character collapses to exactly one byte after decoding
  return strlen(utf8_decode($string));
}
1102f954959Sandi
/**
 * Unicode aware replacement for substr()
 *
 * $start and $length are counted in characters, not bytes. Both are
 * cast to integers before they are placed into the regular expression.
 * Negative values are not supported and yield false.
 *
 * @todo   Handle negative positions etc.
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    substr()
 * @param  string   $str    a UTF-8 encoded string
 * @param  int      $start  number of characters to skip
 * @param  int|null $length maximum number of characters to return,
 *                          null returns everything up to the end
 * @return string|false the substring; false when $start lies beyond the
 *                      end of the string, on negative arguments or when
 *                      $str is not valid UTF-8
 */
function utf8_substr($str, $start, $length=null){
  $start = (int) $start;
  if($start < 0) return false;

  if(is_null($length)){
    $capture = '.*';
  }else{
    $length = (int) $length;
    if($length < 0) return false;
    // exactly $length characters when available, else whatever is left
    $capture = _utf8_substr_repeat($length).'|.*';
  }

  $pattern = '/^'._utf8_substr_repeat($start).'('.$capture.')/us';
  if(preg_match($pattern, $str, $matches)){
    return $matches[1];
  }
  return false;
}

/**
 * Build a pattern fragment matching exactly $count arbitrary characters
 *
 * PCRE does not allow a single {n} quantifier above 65535, so larger
 * counts are expressed as repeated chunks of 65535 plus a remainder.
 *
 * @param  int $count non-negative number of characters
 * @return string
 */
function _utf8_substr_repeat($count){
  $chunks = (int) ($count / 65535);
  $rest   = $count % 65535;

  $pattern = '';
  if($chunks > 0){
    $pattern .= '(?:.{65535}){'.$chunks.'}';
  }
  return $pattern.'.{'.$rest.'}';
}
132f29317c1Sandi
/**
 * Unicode aware replacement for explode()
 *
 * The separator is quoted and used as a UTF-8 regular expression for
 * preg_split(). An empty separator raises an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the UTF-8 string to split
 * @return array|false the pieces, false for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
148f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as UTF-8 regular expressions.
 * Replacements are inserted literally: '$' and '\' carry no special
 * meaning. As with str_replace(), an empty search string never matches.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   the string(s) to search for
 * @param  string|array $r   the replacement(s)
 * @param  string|array $str the UTF-8 subject(s)
 * @return string|array|null result of preg_replace(); null on error
 *                           (e.g. a subject that is not valid UTF-8)
 */
function utf8_str_replace($s,$r,$str){
  // preg_replace() expands $n and \n in replacements - neutralise them
  $literal = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach($r as $k => $v){
      $r[$k] = strtr((string) $v, $literal);
    }
  }else{
    $r = strtr((string) $r, $literal);
  }

  $single   = !is_array($s);
  $patterns = array();
  foreach(($single ? array($s) : $s) as $needle){
    $needle = (string) $needle;
    if($needle === ''){
      // (?!) can never match; keeps the pairing with $r intact
      $patterns[] = '!(?!)!';
    }else{
      $patterns[] = '!'.preg_quote($needle,'!').'!u';
    }
  }
  if($single) $patterns = $patterns[0];

  return preg_replace($patterns,$r,$str);
}
166f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. With a charlist every
 * character in it (multibyte ones included) is stripped from the start
 * of the string. The '..' range syntax of ltrim() is not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist characters to strip, '' for whitespace
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote charlist for use in a character class (covers ^ - ] \ and /)
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
182f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. With a charlist every
 * character in it (multibyte ones included) is stripped from the end
 * of the string. The '..' range syntax of rtrim() is not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist characters to strip, '' for whitespace
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote charlist for use in a character class (covers ^ - ] \ and /)
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
198f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used; otherwise the charlist
 * is handed to utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist characters to strip, '' for whitespace
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be passed on, else only whitespace is trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
211f29317c1Sandi
2122f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mb_string extension if available (and UTF8_NOMBSTRING is not
 * defined); otherwise the $UTF8_UPPER_TO_LOWER lookup table is applied
 * code point by code point.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string a UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  foreach($uni as $i => $codepoint){
    // most code points have no mapping - test with isset() to avoid
    // an undefined index access for each of them
    if(isset($UTF8_UPPER_TO_LOWER[$codepoint])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$codepoint];
    }
  }
  return unicode_to_utf8($uni);
}
23682257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mb_string extension if available (and UTF8_NOMBSTRING is not
 * defined); otherwise the $UTF8_LOWER_TO_UPPER lookup table is applied
 * code point by code point.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string a UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  foreach($uni as $i => $codepoint){
    // most code points have no mapping - test with isset() to avoid
    // an undefined index access for each of them
    if(isset($UTF8_LOWER_TO_UPPER[$codepoint])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$codepoint];
    }
  }
  return unicode_to_utf8($uni);
}
26082257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional $case parameter selects which letters are handled:
 * -1 deaccents lower case letters only, 1 upper case letters only and
 * 0 (the default) both. The replacements are read from the global
 * tables $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string the UTF-8 string to process
 * @param  int    $case   -1, 0 or 1 as described above
 * @return string
 */
function utf8_deaccent($string,$case=0){
  // lower case table first, then the upper case one
  $tables = array();
  if($case <= 0) $tables[] = 'UTF8_LOWER_ACCENTS';
  if($case >= 0) $tables[] = 'UTF8_UPPER_ACCENTS';

  foreach($tables as $name){
    $map    = $GLOBALS[$name];
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
28082257610Sandi
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Be sure to specify all specialchars you give in $repl in $keep, too
 * or it won't work.
 *
 * In addition to the characters listed in $UTF8_SPECIAL_CHARS the
 * control characters 0x00 to 0x19 are always stripped (they are not
 * part of that table).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$keep=''){
  global $UTF8_SPECIAL_CHARS;

  // take the characters to keep out of the list of specials
  $specials = ($keep != '')
            ? array_diff($UTF8_SPECIAL_CHARS, utf8_to_unicode($keep))
            : $UTF8_SPECIAL_CHARS;

  // turn the code points into a quoted character class body
  $class = preg_quote(unicode_to_utf8($specials), '/');

  return preg_replace('/[\x00-\x19'.$class.']/u',$repl,$string);
}
308099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses the mb_string extension if available (and UTF8_NOMBSTRING is not
 * defined). The fallback splits the haystack at the needle and counts
 * the characters in front of the first occurrence.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack the UTF-8 string to search in
 * @param  string $needle   the UTF-8 string to look for
 * @param  int    $offset   number of characters to skip before searching
 * @return int|false character position of the first match, false if the
 *                   needle was not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    $parts = utf8_explode($needle, $haystack);
    // utf8_explode() yields false for an empty needle or invalid UTF-8
    if ( is_array($parts) && count($parts) > 1 ) {
      // position = number of characters in front of the first match
      return strlen(utf8_decode($parts[0]));
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search the remainder and shift the result back by the offset
  $tail = utf8_substr($haystack, $offset);
  if ( $tail === false ) return false;

  $pos = utf8_strpos($tail,$needle);
  if ( $pos !== false ) {
    return $pos + $offset;
  }
  return false;
}
3412f954959Sandi
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values:
 *
 * Sequences of one to four bytes are decoded. The input is not
 * validated; malformed byte sequences give unspecified code points.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded text
 * @return array  list of integer code points
 */
function utf8_to_unicode( $str ) {
  $unicode = array();
  $values = array();
  $lookingFor = 1;
  $len = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // first byte of a sequence tells how many bytes belong to it
    if ( count( $values ) == 0 ) {
      if ( $thisValue < 224 )     $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else                        $lookingFor = 4;
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 )
                + ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 )
                + ( $values[2] % 64 );
      } else {
        $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
      }
      $unicode[] = $number;
      $values = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}
37382257610Sandi
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Code points are written as one to four bytes depending on their value.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points
 * @return string UTF-8 encoded text
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 128 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 2048 ) {
      $utf8.= chr( 192 + ( $unicode >> 6 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } elseif ( $unicode < 65536 ) {
      $utf8.= chr( 224 + ( $unicode >> 12 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } else {
      // code points above 0xFFFF need a four byte sequence
      $utf8.= chr( 240 + ( $unicode >> 18 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 12 ) & 63 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    }
  }
  return $utf8;
}
39782257610Sandi
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of a lower case letter
 * to the code point of its corresponding upper case letter.
 *
 * Each key is listed once; repeating a key in the literal would only
 * overwrite the earlier entry with the same value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtoupper()
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x0123=>0x0122,
);
45182257610Sandi
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code point of an upper case letter
 * to the code point of its corresponding lower case letter. It is built
 * by flipping $UTF8_LOWER_TO_UPPER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtolower()
 */
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
46182257610Sandi
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented characters to replacements taken from
 * the ASCII-7 range. It holds lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
48882257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented characters to replacements taken from
 * the ASCII-7 range. The keys are the upper case forms of the letters,
 * so utf8_deaccent() can tell them apart from their lower case
 * counterparts. The only exception is 'ß', which is kept as it is.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'ß' => 'Ss', 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
51582257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * The entries are Unicode code points. The control characters 0x00 to 0x19
 * are _not_ included in this array. The space 0x20 is!
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
  0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
581340756e4Sandi
582340756e4Sandi
583340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
584