xref: /dokuwiki/inc/Utf8/PhpString.php (revision 316e3ee67cce340deac79a8c6f89d881b178d094)
1<?php
2
3namespace dokuwiki\Utf8;
4
5/**
6 * UTF-8 aware equivalents to PHP's string functions
7 */
8class PhpString
9{
10
11    /**
12     * A locale independent basename() implementation
13     *
14     * works around a bug in PHP's basename() implementation
15     *
16     * @param string $path A path
17     * @param string $suffix If the name component ends in suffix this will also be cut off
18     * @return string
19     * @link   https://bugs.php.net/bug.php?id=37738
20     *
21     * @see basename()
22     */
23    public static function basename($path, $suffix = '')
24    {
25        $path = trim($path, '\\/');
26        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
27        if ($rpos) {
28            $path = substr($path, $rpos + 1);
29        }
30
31        $suflen = strlen($suffix);
32        if ($suflen && (substr($path, -$suflen) === $suffix)) {
33            $path = substr($path, 0, -$suflen);
34        }
35
36        return $path;
37    }
38
39    /**
40     * Unicode aware replacement for strlen()
41     *
42     * utf8_decode() converts characters that are not in ISO-8859-1
43     * to '?', which, for the purpose of counting, is alright
44     *
45     * @param string $string
46     * @return int
47     * @see    utf8_decode()
48     *
49     * @author <chernyshevsky at hotmail dot com>
50     * @see    strlen()
51     */
52    public static function strlen($string)
53    {
54        if (UTF8_MBSTRING) {
55            return mb_strlen($string, 'UTF-8');
56        }
57
58        if (function_exists('iconv_strlen')) {
59            return iconv_strlen($string, 'UTF-8');
60        }
61
62        // utf8_decode is deprecated
63        if (function_exists('utf8_decode')) {
64            return strlen(utf8_decode($string));
65        }
66
67        return strlen($string);
68    }
69
70    /**
71     * UTF-8 aware alternative to substr
72     *
73     * Return part of a string given character offset (and optionally length)
74     *
75     * @param string $str
76     * @param int $offset number of UTF-8 characters offset (from left)
77     * @param int $length (optional) length in UTF-8 characters from offset
78     * @return string
79     * @author Harry Fuecks <hfuecks@gmail.com>
80     * @author Chris Smith <chris@jalakai.co.uk>
81     *
82     */
83    public static function substr($str, $offset, $length = null)
84    {
85        if (UTF8_MBSTRING) {
86            if ($length === null) {
87                return mb_substr($str, $offset);
88            }
89
90            return mb_substr($str, $offset, $length);
91        }
92
93        /*
94         * Notes:
95         *
96         * no mb string support, so we'll use pcre regex's with 'u' flag
97         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
98         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
99         *
100         * substr documentation states false can be returned in some cases (e.g. offset > string length)
101         * mb_substr never returns false, it will return an empty string instead.
102         *
103         * calculating the number of characters in the string is a relatively expensive operation, so
104         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
105         */
106
107        // cast parameters to appropriate types to avoid multiple notices/warnings
108        $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
109        $offset = (int)$offset;
110        if ($length !== null) $length = (int)$length;
111
112        // handle trivial cases
113        if ($length === 0) return '';
114        if ($offset < 0 && $length < 0 && $length < $offset) return '';
115
116        $offset_pattern = '';
117        $length_pattern = '';
118
119        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
120        if ($offset < 0) {
121            $strlen = self::strlen($str);        // see notes
122            $offset = $strlen + $offset;
123            if ($offset < 0) $offset = 0;
124        }
125
126        // establish a pattern for offset, a non-captured group equal in length to offset
127        if ($offset > 0) {
128            $Ox = (int)($offset / 65535);
129            $Oy = $offset % 65535;
130
131            if ($Ox) $offset_pattern = '(?:.{65535}){' . $Ox . '}';
132            $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
133        } else {
134            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
135        }
136
137        // establish a pattern for length
138        if ($length === null) {
139            $length_pattern = '(.*)$';                  // the rest of the string
140        } else {
141
142            if (!isset($strlen)) $strlen = self::strlen($str);    // see notes
143            if ($offset > $strlen) return '';           // another trivial case
144
145            if ($length > 0) {
146                // reduce any length that would go past the end of the string
147                $length = min($strlen - $offset, $length);
148                $Lx = (int)($length / 65535);
149                $Ly = $length % 65535;
150                // +ve length requires ... a captured group of length characters
151                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
152                $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
153            } elseif ($length < 0) {
154                if ($length < ($offset - $strlen)) return '';
155                $Lx = (int)((-$length) / 65535);
156                $Ly = (-$length) % 65535;
157                // -ve length requires ... capture everything except a group of -length characters
158                //                         anchored at the tail-end of the string
159                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
160                $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
161            }
162        }
163
164        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
165        return $match[1];
166    }
167
168    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
169    /**
170     * Unicode aware replacement for substr_replace()
171     *
172     * @param string $string input string
173     * @param string $replacement the replacement
174     * @param int $start the replacing will begin at the start'th offset into string.
175     * @param int $length If given and is positive, it represents the length of the portion of string which is
176     *                            to be replaced. If length is zero then this function will have the effect of inserting
177     *                            replacement into string at the given start offset.
178     * @return string
179     * @see    substr_replace()
180     *
181     * @author Andreas Gohr <andi@splitbrain.org>
182     */
183    public static function substr_replace($string, $replacement, $start, $length = 0)
184    {
185        $ret = '';
186        if ($start > 0) $ret .= self::substr($string, 0, $start);
187        $ret .= $replacement;
188        $ret .= self::substr($string, $start + $length);
189        return $ret;
190    }
191    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
192
193    /**
194     * Unicode aware replacement for ltrim()
195     *
196     * @param string $str
197     * @param string $charlist
198     * @return string
199     * @see    ltrim()
200     *
201     * @author Andreas Gohr <andi@splitbrain.org>
202     */
203    public static function ltrim($str, $charlist = '')
204    {
205        if ($charlist === '') return ltrim($str);
206
207        //quote charlist for use in a characterclass
208        $charlist = preg_replace('!([\\\\\\-\\]\\[/])!', '\\\${1}', $charlist);
209
210        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
211    }
212
213    /**
214     * Unicode aware replacement for rtrim()
215     *
216     * @param string $str
217     * @param string $charlist
218     * @return string
219     * @see    rtrim()
220     *
221     * @author Andreas Gohr <andi@splitbrain.org>
222     */
223    public static function rtrim($str, $charlist = '')
224    {
225        if ($charlist === '') return rtrim($str);
226
227        //quote charlist for use in a characterclass
228        $charlist = preg_replace('!([\\\\\\-\\]\\[/])!', '\\\${1}', $charlist);
229
230        return preg_replace('/[' . $charlist . ']+$/u', '', $str);
231    }
232
233    /**
234     * Unicode aware replacement for trim()
235     *
236     * @param string $str
237     * @param string $charlist
238     * @return string
239     * @see    trim()
240     *
241     * @author Andreas Gohr <andi@splitbrain.org>
242     */
243    public static function trim($str, $charlist = '')
244    {
245        if ($charlist === '') return trim($str);
246
247        return self::ltrim(self::rtrim($str, $charlist), $charlist);
248    }
249
250    /**
251     * This is a unicode aware replacement for strtolower()
252     *
253     * Uses mb_string extension if available
254     *
255     * @param string $string
256     * @return string
257     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
258     *
259     * @author Leo Feyer <leo@typolight.org>
260     * @see    strtolower()
261     */
262    public static function strtolower($string)
263    {
264        if($string === null) return ''; // pre-8.1 behaviour
265        if (UTF8_MBSTRING) {
266            if (class_exists('Normalizer', $autoload = false)) {
267                return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
268            }
269            return (mb_strtolower($string, 'utf-8'));
270        }
271        return strtr($string, Table::upperCaseToLowerCase());
272    }
273
274    /**
275     * This is a unicode aware replacement for strtoupper()
276     *
277     * Uses mb_string extension if available
278     *
279     * @param string $string
280     * @return string
281     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
282     *
283     * @author Leo Feyer <leo@typolight.org>
284     * @see    strtoupper()
285     */
286    public static function strtoupper($string)
287    {
288        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');
289
290        return strtr($string, Table::lowerCaseToUpperCase());
291    }
292
293
294    /**
295     * UTF-8 aware alternative to ucfirst
296     * Make a string's first character uppercase
297     *
298     * @param string $str
299     * @return string with first character as upper case (if applicable)
300     * @author Harry Fuecks
301     *
302     */
303    public static function ucfirst($str)
304    {
305        switch (self::strlen($str)) {
306            case 0:
307                return '';
308            case 1:
309                return self::strtoupper($str);
310            default:
311                preg_match('/^(.{1})(.*)$/us', $str, $matches);
312                return self::strtoupper($matches[1]) . $matches[2];
313        }
314    }
315
316    /**
317     * UTF-8 aware alternative to ucwords
318     * Uppercase the first character of each word in a string
319     *
320     * @param string $str
321     * @return string with first char of each word uppercase
322     * @author Harry Fuecks
323     * @see http://php.net/ucwords
324     *
325     */
326    public static function ucwords($str)
327    {
328        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
329        // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
330        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
331        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
332
333        return preg_replace_callback(
334            $pattern,
335            function ($matches) {
336                $leadingws = $matches[2];
337                $ucfirst = self::strtoupper($matches[3]);
338                $ucword = self::substr_replace(ltrim($matches[0]), $ucfirst, 0, 1);
339                return $leadingws . $ucword;
340            },
341            $str
342        );
343    }
344
345    /**
346     * This is an Unicode aware replacement for strpos
347     *
348     * @param string $haystack
349     * @param string $needle
350     * @param integer $offset
351     * @return integer
352     * @author Leo Feyer <leo@typolight.org>
353     * @see    strpos()
354     *
355     */
356    public static function strpos($haystack, $needle, $offset = 0)
357    {
358        $comp = 0;
359        $length = null;
360
361        while ($length === null || $length < $offset) {
362            $pos = strpos($haystack, $needle, $offset + $comp);
363
364            if ($pos === false)
365                return false;
366
367            $length = self::strlen(substr($haystack, 0, $pos));
368
369            if ($length < $offset)
370                $comp = $pos - $length;
371        }
372
373        return $length;
374    }
375}
376