<?php
/**
 * UTF8 helper functions
 *
 * xref: /dokuwiki/inc/utf8.php (revision e1906e6eda8098573a47fc7c78663348500920bd)
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will be encoded only
 * if it contains characters other than a-z, A-Z, 0-9 and / _ - . % -
 * this makes it safe to run it multiple times on the same string
 * (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file The filename to encode
 * @param  bool   $safe Return the name untouched when it only contains safe characters
 * @return string The encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier makes '$' match only at the very end of the subject.
  // Without it a name with a trailing newline would be considered safe
  // and returned unencoded.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  // urlencode() also encodes the path separator - restore it
  return str_replace('%2F','/',urlencode($file));
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file The encoded filename
 * @return string The decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check works on bytes: any byte above 127 makes it fail.
 * An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str The string to check
 * @return bool   true when no byte has the high bit set
 */
function utf8_isASCII($str){
  // No 'u' modifier on purpose: the pattern has to look at raw bytes.
  // This replaces a loop using the $str{$i} offset syntax, which is no
  // longer valid in PHP 8.
  return !preg_match('/[\x80-\xFF]/',$str);
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 127 is removed,
 * all other bytes are kept in their original order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str The string to clean
 * @return string The string without any byte >= 0x80
 */
function utf8_strip($str){
  // Byte-wise pattern (no 'u' modifier). This replaces a loop using the
  // $str{$i} offset syntax, which is no longer valid in PHP 8.
  return preg_replace('/[\x80-\xFF]+/','',$str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the string byte by byte and checks that every lead byte is
 * followed by the number of continuation bytes (10bbbbbb) it announces.
 * Lead bytes announcing up to five continuation bytes are accepted.
 * Only this structure is checked - the decoded values are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str The string to check
 * @return bool   false as soon as a malformed or truncated sequence is found
 */
function utf8_check($Str) {
  $len = strlen($Str);
  for ($i=0; $i<$len; $i++) {
    $byte = ord($Str[$i]);
    if ($byte < 0x80) continue;                 # 0bbbbbbb
    elseif (($byte & 0xE0) == 0xC0) $n=1;       # 110bbbbb
    elseif (($byte & 0xF0) == 0xE0) $n=2;       # 1110bbbb
    elseif (($byte & 0xF8) == 0xF0) $n=3;       # 11110bbb
    elseif (($byte & 0xFC) == 0xF8) $n=4;       # 111110bb
    elseif (($byte & 0xFE) == 0xFC) $n=5;       # 1111110b
    else return false;                          # Does not match any model
    for ($j=0; $j<$n; $j++) {                   # n bytes matching 10bbbbbb follow ?
      if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80)) return false;
    }
  }
  return true;
}

/**
 * This is a unicode aware replacement for strlen()
 *
 * Uses mb_string extension if available. Otherwise (or when the
 * constant UTF8_NOMBSTRING is defined) the string is decoded with
 * utf8_to_unicode() and the resulting code points are counted.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strlen()
 * @param  string $string The UTF-8 string to measure
 * @return int    Number of characters
 */
function utf8_strlen($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen'))
    return mb_strlen($string,'utf-8');

  return count(utf8_to_unicode($string));
}

/**
 * This is a unicode aware replacement for substr()
 *
 * Uses mb_string extension if available. Otherwise (or when the
 * constant UTF8_NOMBSTRING is defined) the string is decoded with
 * utf8_to_unicode(), sliced with array_slice() and encoded again.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr()
 * @param  string   $str    The UTF-8 string to cut
 * @param  int      $start  Offset of the first character
 * @param  int|null $length Number of characters, null to take everything up to the end
 * @return string   The requested part of the string
 */
function utf8_substr($str, $start, $length=null){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_substr'))
    return mb_substr($str,$start,$length,'utf-8');

  $uni = utf8_to_unicode($str);
  return unicode_to_utf8(array_slice($uni,$start,$length));
}

/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise (or when the
 * constant UTF8_NOMBSTRING is defined) every code point found in the
 * global $UTF8_UPPER_TO_LOWER table is replaced by its mapped value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string The UTF-8 string to convert
 * @return string The lower cased string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an "undefined key" warning for every code point
    // that has no entry in the table
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise (or when the
 * constant UTF8_NOMBSTRING is defined) every code point found in the
 * global $UTF8_LOWER_TO_UPPER table is replaced by its mapped value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string The UTF-8 string to convert
 * @return string The upper cased string
 */
function utf8_strtoupper($string){
  // This has to call mb_strtoupper - calling mb_strtolower here would
  // return a lower cased string.
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an "undefined key" warning for every code point
    // that has no entry in the table
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS; the lower case table is applied first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF-8 string to deaccent
 * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
 * @return string The string with all table entries replaced
 */
function utf8_deaccent($string,$case=0){
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $string = str_replace(array_keys($UTF8_LOWER_ACCENTS),array_values($UTF8_LOWER_ACCENTS),$string);
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $string = str_replace(array_keys($UTF8_UPPER_ACCENTS),array_values($UTF8_UPPER_ACCENTS),$string);
  }
  return $string;
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Be sure to specify all specialchars you give in $repl in $keep, too
 * or it won't work.
 *
 * This function adds the controlchars 0x00 to 0x19 to the array of
 * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string The string with every special char replaced by $repl
 */
function utf8_stripspecials($string,$repl='',$keep=''){
  global $UTF8_SPECIAL_CHARS;
  if($keep != ''){
    // take the code points of the chars to keep out of the list
    $specials = array_diff($UTF8_SPECIAL_CHARS, utf8_to_unicode($keep));
  }else{
    $specials = $UTF8_SPECIAL_CHARS;
  }

  // build one character class out of the remaining code points
  $specials = unicode_to_utf8($specials);
  $specials = preg_quote($specials, '/');

  return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Otherwise (or when the
 * constant UTF8_NOMBSTRING is defined) both strings are decoded with
 * utf8_to_unicode() and compared code point by code point.
 *
 * In the fallback an empty needle is never found and a negative
 * offset starts the search at the beginning of the haystack.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @author Andreas Gohr <andi@splitbrain.org>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    strpos()
 * @param  string $haystack The UTF-8 string to search in
 * @param  string $needle   The UTF-8 string to search for
 * @param  int    $offset   Character position to start the search at
 * @return int|false Character position of the first match, false if there is none
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  $hay  = utf8_to_unicode($haystack);
  $pin  = utf8_to_unicode($needle);
  $nlen = count($pin);
  if($nlen == 0) return false;

  // A match can not start later than this position. Stopping here keeps
  // the comparison below from reading past the end of the haystack.
  $last = count($hay) - $nlen;

  for($pos = max(0,(int) $offset); $pos <= $last; $pos++){
    $i = 0;
    while($i < $nlen && $hay[$pos + $i] == $pin[$i]) $i++;
    if($i == $nlen) return $pos;
  }
  return false;
}

/**
 * This function will take any UTF-8 encoded text and return it as
 * a list of Unicode values:
 *
 * Bytes below 128 are copied as they are. Any other byte is collected
 * until the sequence announced by its first byte is complete: two bytes
 * for a first byte below 224, three bytes below 240, four bytes
 * otherwise. The input is not validated, and a sequence that is still
 * incomplete at the end of the string is dropped.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str The UTF-8 encoded string
 * @return array  List of Unicode code points (integers)
 */
function utf8_to_unicode( $str ) {
  $unicode = array();
  $values = array();
  $lookingFor = 1;
  $len = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // first byte of a sequence: it tells how many bytes belong together
    if ( count( $values ) == 0 ) {
      if ( $thisValue < 224 ) $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else $lookingFor = 4;
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 ) +
                  ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
      } else {
        $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
      }
      $unicode[] = $number;
      $values = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}

/**
 * This function will convert a Unicode array back to its UTF-8 representation
 *
 * Values below 128 become one byte, values below 2048 two bytes,
 * values below 65536 three bytes and everything above four bytes.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str List of Unicode code points (integers)
 * @return string The UTF-8 encoded string
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 128 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 2048 ) {
      $utf8.= chr( 192 + ( $unicode >> 6 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } elseif ( $unicode < 65536 ) {
      $utf8.= chr( 224 + ( $unicode >> 12 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    } else {
      // without this branch the first byte of the three byte form
      // would run out of its range
      $utf8.= chr( 240 + ( $unicode >> 18 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 12 ) & 63 ) );
      $utf8.= chr( 128 + ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 + ( $unicode & 63 ) );
    }
  }
  return $utf8;
}

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of lower case letters to the
 * code points of their corresponding upper case letters.
 *
 * A few keys are listed more than once, always with the same value,
 * so the repeated entries have no effect on the resulting array.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtoupper()
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);

/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the code points of upper case letters to the
 * code points of their corresponding lower case letters (it does so by
 * flipping $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtolower()
 */
$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);

/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookuptable defines ASCII-7 replacements for accented
 * characters. These are lower case letters only. Some replacements
 * are longer than one character (for example 'ä' => 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);

/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookuptable defines ASCII-7 replacements for accented
 * characters. These are upper case letters only, so every key has to
 * be the upper case form of the letter. Some replacements are longer
 * than one character (for example 'Ä' => 'Ae').
 *
 * The lower case letter 'ß' is not part of this table, it is handled
 * by $UTF8_LOWER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);

/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * The entries are Unicode code points, not UTF-8 encoded strings.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
  0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);