xref: /dokuwiki/inc/Utf8/Clean.php (revision f41bbe4cad0871728891d9ffb45bd6fd79ab1024)
1*f41bbe4cSAndreas Gohr<?php
2*f41bbe4cSAndreas Gohr
3*f41bbe4cSAndreas Gohrnamespace dokuwiki\Utf8;
4*f41bbe4cSAndreas Gohr
/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and operate on PHP byte strings.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F was found
     */
    public static function isASCII($str)
    {
        // a single byte outside the 7bit range is enough to fail the check
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte has to be followed by
     * the number of continuation bytes (10bbbbbb) it announces. Overlong forms
     * and the historic 5 and 6 byte sequences are accepted as well.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // n bytes matching 10bbbbbb have to follow
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string the input with every byte >= 0x80 removed
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly brace offset syntax is gone in PHP 8
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * In addition to the characters returned by Table::specialChars() the
     * bytes 0x00 to 0x19 are stripped, too.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string The UTF8 string to strip of special chars
     * @param  string $repl Replace special with this string
     * @param  string $additional Additional chars to strip (inserted as is into a regexp char class)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted character list is built once and reused on later calls
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are dropped) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $UTF8_BAD =
            '([\x00-\x7F]' .                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
            '|(.{1}))';                               # invalid byte
        $pattern = '/' . $UTF8_BAD . '/S';

        $result = '';
        $offset = 0;
        $len = strlen($str);

        // Walk the string one match at a time. Every byte is matched by one of
        // the alternatives, so each match starts exactly at $offset. Using the
        // offset avoids copying the remaining string on every iteration.
        while ($offset < $len && preg_match($pattern, $str, $matches, 0, $offset)) {
            // group 2 is only set when the catch-all "invalid byte" branch matched
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $offset += strlen($matches[0]);
        }

        return $result;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        // zero satisfies both conditions, so both tables are applied then
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; // nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        // indexes outside of the string are clamped to its start or end
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            // skip forward over continuation bytes (10bbbbbb)
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            // step back over continuation bytes (10bbbbbb), stopping at index 0
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }

}
205