1<?php
2
3namespace dokuwiki\Utf8;
4
5/**
6 * UTF-8 aware equivalents to PHP's string functions
7 */
class PhpString
{
    /**
     * A locale independent basename() implementation
     *
     * Works around a bug in PHP's basename() implementation, whose result depends
     * on the current locale. Both '/' and '\' are treated as directory separators.
     *
     * @param string $path A path
     * @param string $suffix If the name component ends in suffix this will also be cut off
     * @return string the trailing name component of the path
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @see basename()
     */
    public static function basename($path, $suffix = '')
    {
        // separators at either end never belong to the name component
        $path = trim($path, '\\/');

        // position of the last separator of either kind; after the trim above it can never be 0
        $rpos = strrpos(strtr($path, '\\', '/'), '/');
        if ($rpos !== false) {
            $path = substr($path, $rpos + 1);
        }

        $suflen = strlen($suffix);
        if ($suflen && str_ends_with($path, $suffix)) {
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Uses mbstring when UTF8_MBSTRING is set, iconv otherwise. When neither can
     * deliver a result, the characters are counted by ignoring UTF-8 continuation
     * bytes (0x80-0xBF); every character contributes exactly one other byte.
     *
     * @param string $string
     * @return int number of UTF-8 characters in the string
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     */
    public static function strlen($string)
    {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        if (function_exists('iconv_strlen')) {
            $length = iconv_strlen($string, 'UTF-8');
            // iconv_strlen() yields false on malformed input; fall through to byte counting then
            if ($length !== false) return $length;
        }

        // utf8_decode() is deprecated and plain strlen() counts bytes, so count
        // every byte that is not a continuation byte instead
        $string = (string)$string;
        return strlen($string) - (int)preg_match_all('/[\x80-\xBF]/', $string);
    }

    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * Like mb_substr() (and unlike the historic substr()) this never returns false,
     * an empty string is returned instead.
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left), negative values count from the end
     * @param int $length (optional) length in UTF-8 characters from offset, negative values stop
     *                    that many characters before the end of the string
     * @return string
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     */
    public static function substr($str, $offset, $length = null)
    {
        if (UTF8_MBSTRING) {
            // a null length means "up to the end"; always name the encoding explicitly
            // instead of relying on mb_internal_encoding()
            return mb_substr($str, $offset, $length, 'UTF-8');
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length !== null && $length < $offset) return '';

        $strlen = null;

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = self::strlen($str);        // see notes
            $offset = max(0, $strlen + $offset);
        }

        // skip $offset characters with a non-captured group, or just anchor the pattern
        $pattern = '^';
        if ($offset > 0) {
            $pattern .= '(?:' . self::repeatAnyChar($offset) . ')';
        }

        if ($length === null) {
            $pattern .= '(.*)$';                        // the rest of the string
        } else {
            $strlen ??= self::strlen($str);             // see notes
            if ($offset > $strlen) return '';           // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string,
                // then capture a group of exactly $length characters
                $length = min($strlen - $offset, $length);
                $pattern .= '(' . self::repeatAnyChar($length) . ')';
            } else {
                if ($length < ($offset - $strlen)) return '';
                // -ve length: capture everything except a group of -$length characters
                //             anchored at the tail-end of the string
                $pattern .= '(.*)(?:' . self::repeatAnyChar(-$length) . ')$';
            }
        }

        if (!preg_match('#' . $pattern . '#us', $str, $match)) return '';
        return $match[1];
    }

    /**
     * Build a regex fragment matching exactly $count arbitrary characters
     *
     * PCRE limits a single repetition to 65535, so larger counts are expressed as
     * repeated groups of 65535 characters plus a remainder.
     *
     * @param int $count non-negative number of characters to match
     * @return string regex fragment (without delimiters or capturing groups)
     */
    private static function repeatAnyChar($count)
    {
        $chunks = intdiv($count, 65535);
        $rest = $count % 65535;

        $pattern = $chunks ? '(?:.{65535}){' . $chunks . '}' : '';
        return $pattern . '.{' . $rest . '}';
    }

    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Note that, unlike the native function, the default length is 0 which means
     * the replacement is inserted without removing anything.
     *
     * @param string $string input string
     * @param string $replacement the replacement
     * @param int $start the replacing will begin at the start'th character offset into string. A negative
     *                   value counts from the end of the string.
     * @param int $length If positive, it represents the number of characters of string which are to be
     *                    replaced. If negative, it represents the number of characters from the end of string
     *                    at which to stop replacing. If zero, replacement is inserted at the start offset.
     * @return string
     * @see    substr_replace()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function substr_replace($string, $replacement, $start, $length = 0)
    {
        // counting characters is expensive, so only do it when a value is relative to the end
        if ($start < 0 || $length < 0) {
            $strlen = self::strlen($string);
            if ($start < 0) $start = max(0, $strlen + $start);
            if ($length < 0) $length = max(0, $strlen + $length - $start);
        }

        $ret = '';
        if ($start > 0) $ret .= self::substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= self::substr($string, $start + $length);
        return $ret;
    }
    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps

    /**
     * Unicode aware replacement for ltrim()
     *
     * Every character in the charlist is taken literally; ranges as supported by
     * the native ltrim() ("a..z") are not available.
     *
     * @param string $str
     * @param string $charlist characters to strip, PHP's default whitespace set is used when empty
     * @return string
     * @see    ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function ltrim($str, $charlist = '')
    {
        if ($charlist === '') return ltrim($str);

        return preg_replace('/^[' . self::quoteCharlist($charlist) . ']+/u', '', $str);
    }

    /**
     * Unicode aware replacement for rtrim()
     *
     * Every character in the charlist is taken literally; ranges as supported by
     * the native rtrim() ("a..z") are not available.
     *
     * @param string $str
     * @param string $charlist characters to strip, PHP's default whitespace set is used when empty
     * @return string
     * @see    rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist === '') return rtrim($str);

        // the D modifier makes $ match at the very end only; without it the pattern would also
        // match in front of a trailing newline and strip characters that are not at the end
        return preg_replace('/[' . self::quoteCharlist($charlist) . ']+$/uD', '', $str);
    }

    /**
     * Escape a list of characters for literal use inside a regex character class
     *
     * preg_quote() escapes all characters that are special within a class
     * (backslash, '-', '^', ']', '[') as well as the '/' delimiter.
     *
     * @param string $charlist
     * @return string
     */
    private static function quoteCharlist($charlist)
    {
        return preg_quote($charlist, '/');
    }

    /**
     * Unicode aware replacement for trim()
     *
     * @param string $str
     * @param string $charlist characters to strip, PHP's default whitespace set is used when empty
     * @return string
     * @see    trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist === '') return trim($str);

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, the result is then additionally
     * normalized when the intl Normalizer class has been loaded.
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     */
    public static function strtolower($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour

        if (!UTF8_MBSTRING) {
            return strtr($string, Table::upperCaseToLowerCase());
        }

        $lower = mb_strtolower($string, 'utf-8');

        // do not trigger autoloading, the class is optional
        if (class_exists('Normalizer', false)) {
            $normalized = \Normalizer::normalize($lower);
            // normalize() signals failure with false; keep the lower cased string in that case
            if ($normalized !== false) return $normalized;
        }

        return $lower;
    }

    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtolower()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     */
    public static function strtoupper($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour, same as strtolower()

        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

        return strtr($string, Table::lowerCaseToUpperCase());
    }


    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     * @author Harry Fuecks
     *
     */
    public static function ucfirst($str)
    {
        $str = (string)$str;

        // split into the first character and the remainder; there is no match for an empty
        // string or for invalid UTF-8, in both cases the input is handed back unchanged
        if (!preg_match('/^(.{1})(.*)$/us', $str, $matches)) {
            return $str;
        }

        return self::strtoupper($matches[1]) . $matches[2];
    }

    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @param string $str
     * @return string with first char of each word uppercase
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     */
    public static function ucwords($str)
    {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $space = '\x0c\x09\x0b\x0a\x0d\x20';

        // 1: leading whitespace (empty at the start of the string)
        // 2: first character of the word
        // 3: remainder of the word
        $pattern = '/(^|[' . $space . ']+)([^' . $space . '])([^' . $space . ']*)/u';

        return preg_replace_callback(
            $pattern,
            static function ($matches) {
                return $matches[1] . self::strtoupper($matches[2]) . $matches[3];
            },
            $str
        );
    }

    /**
     * This is an Unicode aware replacement for strpos
     *
     * @param string $haystack
     * @param string $needle
     * @param integer $offset character offset to start searching at, a negative value counts
     *                        from the end of the haystack
     * @return int|false character position of the first occurrence, false when the needle was not found
     * @throws \ValueError when a negative offset points in front of the start of the haystack
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     */
    public static function strpos($haystack, $needle, $offset = 0)
    {
        // the native strpos() would interpret a negative offset in bytes, so it is
        // converted into a character offset from the start first
        if ($offset < 0) {
            $offset += self::strlen($haystack);
            if ($offset < 0) {
                throw new \ValueError('strpos(): Argument #3 ($offset) must be contained in argument #1 ($haystack)');
            }
        }

        // number of bytes the byte offset has to be advanced to make up for multi byte characters
        $comp = 0;

        do {
            $pos = strpos($haystack, $needle, $offset + $comp);

            if ($pos === false) {
                return false;
            }

            // convert the byte position into a character position
            $length = self::strlen(substr($haystack, 0, $pos));

            // a match in front of the requested character offset means the byte offset was
            // too small: continue behind it, skipping at least one byte per missing character
            $comp = $pos - $length;
        } while ($length < $offset);

        return $length;
    }
}
374