1 <?php
2 
3 namespace dokuwiki\Utf8;
4 
/**
 * Methods to assess and clean UTF-8 strings
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F was found
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Checks whether a string is well-formed UTF-8
     *
     * The check is delegated to PCRE: an empty pattern matches every subject,
     * so with the u modifier the match can only fail when the subject is not
     * valid UTF-8. Unlike a plain lead-byte/continuation-byte scan this also
     * rejects overlong encodings, UTF-16 surrogates, the obsolete 5 and 6 byte
     * forms and code points above U+10FFFF, which makes the result agree with
     * the byte pattern used by replaceBadBytes().
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool true for well-formed UTF-8 (including the empty string and pure ASCII)
     */
    public static function isUtf8($str)
    {
        return preg_match('//u', (string) $str) === 1;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string the input with every byte >= 0x80 removed
     */
    public static function strip($str)
    {
        // no u modifier: the pattern works on single bytes, same class as in isASCII()
        return preg_replace('/[^\x00-\x7F]/', '', $str);
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * In addition to the characters returned by Table::specialChars() the
     * range 0x00 to 0x19 is stripped. Note that this range does not cover
     * all ASCII control characters: 0x1A to 0x1F are left untouched.
     *
     * $additional is inserted into the character class as is, so the caller
     * has to escape anything that is special inside a regexp character class.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string The UTF8 string to strip of special chars
     * @param  string $repl Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted list never changes during a request, build it only once
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which drops them) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        // Group 1 wraps whatever was consumed. Group 2 only takes part in the
        // match when the last alternative had to swallow a byte that does not
        // start any well-formed sequence.
        $UTF8_BAD =
            '([\x00-\x7F]' .                          // ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
            '|(.{1}))';                               // invalid byte

        $str = (string) $str;
        $len = strlen($str);
        $clean = '';
        $offset = 0;

        // Walk the subject by byte offset instead of cutting the consumed part
        // off with substr() on every round. The A modifier pins each match to
        // $offset so nothing can be skipped silently.
        while ($offset < $len && preg_match('/' . $UTF8_BAD . '/A', $str, $matches, 0, $offset)) {
            $clean .= isset($matches[2]) ? $replace : $matches[0];
            $offset += strlen($matches[0]);
        }

        return $clean;
    }


    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case negative: lower case only, positive: upper case only, 0: both
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) return $string; //nothing to do

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index that points at a continuation byte (10bbbbbb) is moved until it
     * no longer does. Indexes at or below 0 yield 0, indexes at or beyond the
     * end of the string yield the string length.
     *
     * @author       chris smith <chris@jalakai.co.uk>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for the boundary: false = move back to the first byte of
     *                   the current character, true = move forward to the first byte of the next one
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {

        if ($i <= 0) return 0;

        $limit = strlen($str);
        if ($i >= $limit) return $limit;

        if ($next) {
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++;
        } else {
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;
        }

        return $i;
    }
}
204