xref: /dokuwiki/inc/Utf8/PhpString.php (revision 6c16a3a9aa602bb7e269fb6d5d18e1353e17f97f)
1<?php
2
3namespace dokuwiki\Utf8;
4
/**
 * UTF-8 aware equivalents to PHP's string functions
 *
 * Most methods use the mbstring extension when the UTF8_MBSTRING constant is truthy and fall
 * back to PCRE based code or to strtr() with the mappings provided by Table otherwise.
 */
class PhpString
{
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators.
     *
     * @param string $path A path
     * @param string $suffix If the name component ends in suffix this will also be cut off
     * @return string
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @see basename()
     */
    public static function basename($path, $suffix = '')
    {
        $path = trim($path, '\\/');
        $suffix = (string)$suffix;

        // position of the last separator of either kind, -1 if there is none
        $slash = strrpos($path, '/');
        $backslash = strrpos($path, '\\');
        $rpos = max(
            $slash === false ? -1 : $slash,
            $backslash === false ? -1 : $backslash
        );
        if ($rpos >= 0) {
            $path = substr($path, $rpos + 1);
        }

        // every string "ends with" the empty string and substr($path, 0, -0) would return '',
        // so an empty suffix must never be cut off
        if ($suffix !== '' && str_ends_with($path, $suffix)) {
            $path = substr($path, 0, -strlen($suffix));
        }

        return $path;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Tries mb_strlen(), iconv_strlen() and utf8_decode() in that order. If none of them is
     * usable the byte length is returned.
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright
     *
     * @param string $string
     * @return int
     * @see    utf8_decode()
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     */
    public static function strlen($string)
    {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        if (function_exists('iconv_strlen')) {
            return iconv_strlen($string, 'UTF-8');
        }

        // utf8_decode is deprecated
        if (function_exists('utf8_decode')) {
            return strlen(utf8_decode($string));
        }

        return strlen($string);
    }

    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * Never returns false; an empty string is returned when nothing can be extracted.
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     */
    public static function substr($str, $offset, $length = null)
    {
        if (UTF8_MBSTRING) {
            // pass the encoding explicitly (as strlen() does) instead of relying on the
            // configured internal encoding; a null length means "up to the end"
            return mb_substr($str, $offset, $length, 'UTF-8');
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $strlen = null; // character count, only calculated when needed (see notes)

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = self::strlen($str);
            $offset = max(0, $strlen + $offset);
        }

        // a non-captured group equal in length to offset, or just the anchor for offset == 0
        $offset_pattern = $offset > 0 ? '^(?:' . self::anyCharsPattern($offset) . ')' : '^';

        // establish a pattern for length
        if ($length === null) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {
            if ($strlen === null) $strlen = self::strlen($str);
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($strlen - $offset, $length);
                // +ve length requires a captured group of length characters
                $length_pattern = '(' . self::anyCharsPattern($length) . ')';
            } else {
                // $length is negative here, zero was handled as a trivial case above
                if ($length < ($offset - $strlen)) return '';
                // -ve length requires capturing everything except a group of -length characters
                // anchored at the tail-end of the string
                $length_pattern = '(.*)(?:' . self::anyCharsPattern(-$length) . ')$';
            }
        }

        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
        return $match[1];
    }

    /**
     * Build a PCRE fragment matching exactly $count arbitrary characters
     *
     * The count is split into repeated groups of 65535 characters plus a remainder because a
     * single PCRE quantifier has to be smaller than 65536.
     *
     * @param int $count number of characters to match, must not be negative
     * @return string pattern fragment without delimiters, anchors or capturing groups
     */
    private static function anyCharsPattern($count)
    {
        $chunks = (int)($count / 65535);
        $rest = $count % 65535;

        $pattern = $chunks ? '(?:.{65535}){' . $chunks . '}' : '';
        return $pattern . '.{' . $rest . '}';
    }

    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Unlike the native function the default length is 0, so by default the replacement is
     * inserted and nothing is removed.
     *
     * @param string $string input string
     * @param string $replacement the replacement
     * @param int $start the replacing will begin at the start'th offset into string. A negative value
     *                            counts from the end of the string.
     * @param int $length If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If negative, it is the number of characters from the end of
     *                            string at which to stop replacing. If length is zero then this function will
     *                            have the effect of inserting replacement into string at the given start offset.
     * @return string
     * @see    substr_replace()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function substr_replace($string, $replacement, $start, $length = 0)
    {
        // translate negative arguments into absolute values the same way the native function
        // does; the character count is only needed (and calculated) in that case
        if ($start < 0 || $length < 0) {
            $strlen = self::strlen($string);
            if ($start < 0) $start = max(0, $strlen + $start);
            if ($length < 0) $length = max(0, $strlen - $start + $length);
        }

        $ret = '';
        if ($start > 0) $ret .= self::substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= self::substr($string, $start + $length);
        return $ret;
    }
    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps

    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. Otherwise every character of the charlist
     * is taken literally; ranges like "a..z" are not supported.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function ltrim($str, $charlist = '')
    {
        if ($charlist === '') return ltrim($str);

        // quote charlist for use in a character class, this also covers '^' which would
        // otherwise negate (or break) the class when it is the first character
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
    }

    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. Otherwise every character of the charlist
     * is taken literally; ranges like "a..z" are not supported.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist === '') return rtrim($str);

        // quote charlist for use in a character class, this also covers '^' which would
        // otherwise negate (or break) the class when it is the first character
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/[' . $charlist . ']+$/u', '', $str);
    }

    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist the native trim() is used, otherwise the string is passed through
     * rtrim() and ltrim() of this class.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist === '') return trim($str);

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available. When the Normalizer class is already loaded the
     * result is normalized as well.
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     */
    public static function strtolower($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour

        if (UTF8_MBSTRING) {
            $lower = mb_strtolower($string, 'utf-8');
            if (class_exists('Normalizer', false)) {
                // normalize() signals errors with false, keep the lowercased string in that case
                $normalized = \Normalizer::normalize($lower);
                if ($normalized !== false) return $normalized;
            }
            return $lower;
        }

        return strtr($string, Table::upperCaseToLowerCase());
    }

    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtolower()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     */
    public static function strtoupper($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour, consistent with strtolower()

        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

        return strtr($string, Table::lowerCaseToUpperCase());
    }


    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     * @author Harry Fuecks
     *
     */
    public static function ucfirst($str)
    {
        $length = self::strlen($str);
        if (!$length) return '';
        if ($length === 1) return self::strtoupper($str);

        // the match fails when PCRE rejects the input (e.g. invalid UTF-8), return the string
        // unchanged then instead of working with undefined matches
        if (!preg_match('/^(.{1})(.*)$/us', $str, $matches)) return (string)$str;

        return self::strtoupper($matches[1]) . $matches[2];
    }

    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @param string $str
     * @return string with first char of each word uppercase
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     */
    public static function ucwords($str)
    {
        // form feed, horizontal tab, vertical tab, linefeed, carriage return and space
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $ws = '\x0c\x09\x0b\x0a\x0d\x20';

        // 1: leading whitespace (empty at the start of the string), 2: first character of the
        // word, 3: remainder of the word
        $pattern = '/(^|[' . $ws . ']+)([^' . $ws . '])([^' . $ws . ']*)/u';

        return preg_replace_callback(
            $pattern,
            static function ($matches) {
                // put the word back together from its captured parts, that way exactly the
                // whitespace matched by the pattern is kept in front of it
                return $matches[1] . self::strtoupper($matches[2]) . $matches[3];
            },
            $str
        );
    }

    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done on bytes, the position found is then converted into a
     * character position.
     *
     * @param string $haystack
     * @param string $needle
     * @param integer $offset number of characters to skip at the start of haystack
     * @return integer|false character position of the first occurrence, false if there is none
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     */
    public static function strpos($haystack, $needle, $offset = 0)
    {
        $bytes = \strlen($haystack);
        $comp = 0;      // bytes to add to the offset to compensate for multibyte characters
        $length = null; // character position of the most recent match

        while ($length === null || $length < $offset) {
            $start = $offset + $comp;

            // native strpos() throws a ValueError for offsets outside of the haystack on PHP 8,
            // nothing can be found there anyway
            if (abs($start) > $bytes) {
                return false;
            }

            $pos = strpos($haystack, $needle, $start);

            if ($pos === false) {
                return false;
            }

            // convert the byte position into a character position
            $length = self::strlen(substr($haystack, 0, $pos));

            // the match lies before the requested character offset: search again, starting
            // further to the right by the number of extra bytes seen so far
            if ($length < $offset) {
                $comp = $pos - $length;
            }
        }

        return $length;
    }
}
373