xref: /dokuwiki/inc/utf8.php (revision ed7b5f0908941f1bacef7e7c3a02c106a42cd5cc)
<?php
/**
 * UTF8 helper functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
882257610Sandi
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are not encoded.
 *
 * When $safe is true (the default) the name is only encoded if it contains
 * anything besides ASCII letters, digits and the characters / _ - . %
 * A name that was already encoded consists of these characters only and is
 * returned unchanged, which makes it safe to run the function multiple
 * times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when only safe characters are present
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // \z anchors at the very end of the string; a plain $ would also match
  // in front of a trailing newline and let it through unencoded
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+\z#',$file)){
    return $file;
  }
  // urlencode() escapes the directory separator as well - restore it
  return str_replace('%2F','/',urlencode($file));
}
3049c713a3Sandi
/**
 * Decode a URL-encoded filename
 *
 * Counterpart of utf8_encodeFN(); all the work is done by urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
4349c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return bool   true if no byte has its high bit set (also for '')
 */
function utf8_isASCII($str){
  // byte-wise match (no /u modifier): any byte in 0x80-0xFF is non-ASCII
  return !preg_match('/[\x80-\xFF]/',$str);
}
5544f669e9Sandi
/**
 * Checks whether a string is well-formed UTF-8
 *
 * The check is delegated to PCRE: with the /u modifier the subject is
 * validated as UTF-8 before matching and the match fails for malformed
 * input. This rejects truncated sequences and stray continuation bytes as
 * well as forms that are not legal UTF-8: 5 and 6 byte sequences, overlong
 * encodings, surrogates and values above U+10FFFF.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool   true if $Str is valid UTF-8 (the empty string is valid)
 */
function utf8_check($Str) {
 // the empty pattern matches every valid subject, so the result depends
 // on the UTF-8 validation alone (preg_match() gives false when it fails)
 return preg_match('//u',$Str) === 1;
}
7849c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * Counts characters instead of bytes. mb_strlen() is used when the
 * mbstring extension is available and UTF8_NOMBSTRING is not defined;
 * otherwise the string is decoded with utf8_to_unicode() and the resulting
 * code points are counted.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strlen()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
  if(defined('UTF8_NOMBSTRING') || !function_exists('mb_strlen')){
    // no mbstring: one array element per decoded code point
    return count(utf8_to_unicode($string));
  }
  return mb_strlen($string,'utf-8');
}
942f954959Sandi
/**
 * Unicode aware replacement for substr()
 *
 * Offsets and lengths are counted in characters, not bytes. mb_substr() is
 * used when the mbstring extension is available and UTF8_NOMBSTRING is not
 * defined; otherwise the decoded code point list is cut with array_slice()
 * and encoded again.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr()
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  character offset, handed to mb_substr()/array_slice()
 * @param  int|null $length character count, handed on unchanged (default null)
 * @return string
 */
function utf8_substr($str, $start, $length=null){
  if(defined('UTF8_NOMBSTRING') || !function_exists('mb_substr')){
    // no mbstring: slice the list of code points and re-encode the part
    $part = array_slice(utf8_to_unicode($str),$start,$length);
    return unicode_to_utf8($part);
  }
  return mb_substr($str,$start,$length,'utf-8');
}
1102f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Without it (or when
 * UTF8_NOMBSTRING is defined) every code point that has an entry in the
 * global $UTF8_UPPER_TO_LOWER table is replaced by its lower case
 * counterpart; all other code points are left alone.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string the lower cased string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  foreach($uni as $i => $code){
    // isset() keeps unmapped code points from raising an
    // "undefined index" error on the table lookup
    if(isset($UTF8_UPPER_TO_LOWER[$code])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$code];
    }
  }
  return unicode_to_utf8($uni);
}
13382257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Without it (or when
 * UTF8_NOMBSTRING is defined) every code point that has an entry in the
 * global $UTF8_LOWER_TO_UPPER table is replaced by its upper case
 * counterpart; all other code points are left alone.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string the upper cased string
 */
function utf8_strtoupper($string){
  // must be mb_strtoupper() here - calling mb_strtolower() would turn
  // this function into a second utf8_strtolower()
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  foreach($uni as $i => $code){
    // isset() keeps unmapped code points from raising an
    // "undefined index" error on the table lookup
    if(isset($UTF8_LOWER_TO_UPPER[$code])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$code];
    }
  }
  return unicode_to_utf8($uni);
}
15682257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements come from the global lookup tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS. $case selects which of them are applied:
 * -1 lower case table only, 1 upper case table only, 0 (default) the lower
 * case table followed by the upper case table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1, 0 or 1 as described above
 * @return string the string with the replacements applied
 */
function utf8_deaccent($string,$case=0){
  // collect the tables to apply, lower case first
  $tables = array();
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $tables[] = $UTF8_LOWER_ACCENTS;
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $tables[] = $UTF8_UPPER_ACCENTS;
  }

  foreach($tables as $table){
    $string = str_replace(array_keys($table),array_values($table),$string);
  }
  return $string;
}
17682257610Sandi
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip are the code points listed in the global
 * $UTF8_SPECIAL_CHARS plus the bytes 0x00 to 0x19, which are added to the
 * character class here because the table does not contain them.
 *
 * Be sure to specify all special chars you give in $repl in $keep, too,
 * or it won't work.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF8 string to strip of special chars
 * @param  string $repl   Replace special with this string
 * @param  string $keep   Special chars to keep (in UTF8)
 * @return string the result of the preg_replace() call
 */
function utf8_stripspecials($string,$repl='',$keep=''){
  global $UTF8_SPECIAL_CHARS;

  // start from the full table and take out whatever shall be kept
  $strip = $UTF8_SPECIAL_CHARS;
  if($keep != ''){
    $strip = array_diff($strip, utf8_to_unicode($keep));
  }

  // turn the code points into an escaped character class body
  $class = preg_quote(unicode_to_utf8($strip), '/');

  return preg_replace('/[\x00-\x19'.$class.']/u',$repl,$string);
}
204099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Without it (or when
 * UTF8_NOMBSTRING is defined) both strings are decoded with
 * utf8_to_unicode() and compared code point by code point.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @author Andreas Gohr <andi@splitbrain.org>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    strpos()
 * @param  string $haystack UTF-8 string to search in
 * @param  string $needle   UTF-8 string to search for
 * @param  int    $offset   character position to start searching at; in the
 *                          fallback a negative value counts from the end
 * @return int|false character position of the first match, false if the
 *                   needle was not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  $haystack = utf8_to_unicode($haystack);
  $needle   = utf8_to_unicode($needle);
  $hlen     = count($haystack);
  $nlen     = count($needle);

  // the fallback never reports a match for an empty needle
  if($nlen == 0) return false;

  // never index the haystack with a negative key
  if($offset < 0) $offset = max(0, $hlen + $offset);

  // last start position at which the needle still fits into the haystack;
  // stopping there keeps the inner comparison inside the array
  $last = $hlen - $nlen;
  for($position = $offset; $position <= $last; $position++){
    $i = 0;
    while($i < $nlen && $needle[$i] === $haystack[$position + $i]) $i++;
    if($i == $nlen) return $position;
  }
  return false;
}
2382f954959Sandi
/**
 * This function takes any UTF-8 encoded text and returns it as
 * a list of Unicode values (one integer per character)
 *
 * Sequences of two, three and four bytes are decoded. The input is not
 * validated: the length of a sequence is derived from its first byte only
 * and a sequence that is still incomplete at the end of the string is
 * dropped.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded string
 * @return array  list of integer code points
 */
function utf8_to_unicode( $str ) {
  $unicode    = array();
  $values     = array();  // bytes of the multi byte sequence in progress
  $lookingFor = 1;        // total number of bytes that sequence has
  $len        = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );

    // plain ASCII maps to itself
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // first byte of a sequence: it tells how many bytes belong together
    if ( count( $values ) == 0 ) {
      if ( $thisValue < 224 )     $lookingFor = 2;  // 110bbbbb
      elseif ( $thisValue < 240 ) $lookingFor = 3;  // 1110bbbb
      else                        $lookingFor = 4;  // 11110bbb
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      // strip the marker bits and join the payload bits of all bytes
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] & 0x07 ) << 18 ) | ( ( $values[1] & 0x3F ) << 12 ) |
                  ( ( $values[2] & 0x3F ) << 6 )  |   ( $values[3] & 0x3F );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] & 0x0F ) << 12 ) | ( ( $values[1] & 0x3F ) << 6 ) |
                    ( $values[2] & 0x3F );
      } else {
        $number = ( ( $values[0] & 0x1F ) << 6 ) | ( $values[1] & 0x3F );
      }
      $unicode[]  = $number;
      $values     = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}
27082257610Sandi
/**
 * This function converts a list of Unicode values back to its UTF-8
 * representation
 *
 * Values below 0x80 become one byte, below 0x800 two bytes, below 0x10000
 * three bytes and everything else four bytes. The values are not checked
 * for being valid Unicode code points.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points (the parameter name is
 *                     kept for compatibility although it holds an array)
 * @return string UTF-8 encoded string
 */
function unicode_to_utf8( $str ) {
  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 0x80 ) {
      // 0bbbbbbb
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 0x800 ) {
      // 110bbbbb 10bbbbbb
      $utf8.= chr( 0xC0 | ( $unicode >> 6 ) );
      $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
    } elseif ( $unicode < 0x10000 ) {
      // 1110bbbb 10bbbbbb 10bbbbbb
      $utf8.= chr( 0xE0 | ( $unicode >> 12 ) );
      $utf8.= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
      $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
    } else {
      // 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
      $utf8.= chr( 0xF0 | ( ( $unicode >> 18 ) & 0x07 ) );
      $utf8.= chr( 0x80 | ( ( $unicode >> 12 ) & 0x3F ) );
      $utf8.= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
      $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
    }
  }
  return $utf8;
}
29482257610Sandi
/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the Unicode value of a lower case letter (key) to
 * the Unicode value of its corresponding upper case letter (value). Both
 * are integer code points as returned by utf8_to_unicode(), not UTF-8
 * encoded strings.
 *
 * Every lower case letter is listed once only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtoupper()
 */
$UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x0123=>0x0122,
);
34882257610Sandi
/**
 * UTF-8 Case lookup table
 *
 * This lookuptable maps the Unicode value of an upper case letter (key) to
 * the Unicode value of its corresponding lower case letter (value). It is
 * built by flipping $UTF8_LOWER_TO_UPPER, so both tables always agree.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtolower()
 */
// no error suppression needed: the flipped table holds integer values
// only, which array_flip() accepts as keys
$UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);
35882257610Sandi
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps a lower case accented letter (key, UTF-8 encoded) to its
 * replacement from the ASCII-7 range (value). A replacement may be longer
 * than one character, e.g. 'ß' becomes 'ss'.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f', 'ë'=>'e', 'š'=>'s',  'ơ'=>'o',
  'ß'=>'ss', 'ă'=>'a',  'ř'=>'r',  'ț'=>'t', 'ň'=>'n', 'ā'=>'a',  'ķ'=>'k',
  'ŝ'=>'s',  'ỳ'=>'y',  'ņ'=>'n',  'ĺ'=>'l', 'ħ'=>'h', 'ṗ'=>'p',  'ó'=>'o',
  'ú'=>'u',  'ě'=>'e',  'é'=>'e',  'ç'=>'c', 'ẁ'=>'w', 'ċ'=>'c',  'õ'=>'o',
  'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t', 'ș'=>'s', 'ė'=>'e',  'ĉ'=>'c',
  'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c', 'ę'=>'e', 'ŵ'=>'w',  'ṫ'=>'t',
  'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e', 'ŷ'=>'y', 'ą'=>'a',  'ł'=>'l',
  'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g', 'ļ'=>'l', 'ƒ'=>'f',  'ž'=>'z',
  'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i', 'ï'=>'i', 'ḋ'=>'d',  'ť'=>'t',
  'ŗ'=>'r',  'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r', 'ê'=>'e', 'ü'=>'ue', 'ò'=>'o',
  'ē'=>'e',  'ñ'=>'n',  'ń'=>'n',  'ĥ'=>'h', 'ĝ'=>'g', 'đ'=>'d',  'ĵ'=>'j',
  'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',  'ư'=>'u', 'ţ'=>'t', 'ý'=>'y',  'ő'=>'o',
  'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z', 'ī'=>'i', 'ã'=>'a',  'ġ'=>'g',
  'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u', 'į'=>'i', 'ź'=>'z',  'á'=>'a',
  'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',
);
38582257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps an upper case accented letter (key, UTF-8 encoded) to its
 * replacement from the ASCII-7 range (value). A replacement may be longer
 * than one character, e.g. 'Ä' becomes 'Ae'.
 *
 * The keys have to be the upper case forms: utf8_deaccent() applies the
 * lower case table first, so lower case keys in here would never match
 * anything and upper case letters would be left accented.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'ẞ' => 'Ss', 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
41282257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * A list of Unicode values (integer code points) that are treated as
 * special characters, i.e. neither letter nor digit. It collects the
 * special characters defined in the various local charsets and is not a
 * complete list of the non-alphanumeric characters in UTF-8. It's not
 * perfect but should match most cases of special chars.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The
 * space 0x20 is!
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d,
  0x002e, 0x002f, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0142, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
47882257610Sandi?>
479