1 <?php
2 
3 namespace dokuwiki\Utf8;
4 
5 /**
6  * UTF-8 aware equivalents to PHP's string functions
7  */
class PhpString
{
    /**
     * A locale independent basename() implementation
     *
     * Works around a bug in PHP's basename() implementation. Both '/' and '\'
     * are treated as directory separators.
     *
     * @param string $path A path
     * @param string $suffix If the name component ends in suffix this will also be cut off
     * @return string the trailing name component of the path
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @see basename()
     */
    public static function basename($path, $suffix = '')
    {
        $path = trim($path, '\\/');

        // find the last separator of either flavour; -1 stands for "not found"
        $slash = strrpos($path, '/');
        $backslash = strrpos($path, '\\');
        $rpos = max(
            $slash === false ? -1 : $slash,
            $backslash === false ? -1 : $backslash
        );
        if ($rpos >= 0) {
            $path = substr($path, $rpos + 1);
        }

        $suflen = strlen($suffix);
        if ($suflen && str_ends_with($path, $suffix)) {
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Tries mbstring first, then iconv, then utf8_decode(). utf8_decode()
     * converts characters that are not in ISO-8859-1 to '?', which, for the
     * purpose of counting, is alright. If none of these is available the
     * plain byte length is returned.
     *
     * @param string $string
     * @return int number of characters (bytes in the last-resort fallback)
     * @see    utf8_decode()
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     */
    public static function strlen($string)
    {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        if (function_exists('iconv_strlen')) {
            return iconv_strlen($string, 'UTF-8');
        }

        // utf8_decode is deprecated
        if (function_exists('utf8_decode')) {
            return strlen(utf8_decode($string));
        }

        return strlen($string);
    }

    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     */
    public static function substr($str, $offset, $length = null)
    {
        if (UTF8_MBSTRING) {
            // name the encoding explicitly (as strlen() does) instead of relying on
            // mb_internal_encoding(); a null length means "up to the end of the string"
            return mb_substr($str, $offset, $length, 'UTF-8');
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) {
            $length = (int)$length;
        }

        // handle trivial cases
        if ($length === 0) {
            return '';
        }
        if ($offset < 0 && $length < 0 && $length < $offset) {
            return '';
        }

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = self::strlen($str);        // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) {
                $offset = 0;
            }
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset / 65535);
            $Oy = $offset % 65535;

            if ($Ox) {
                $offset_pattern = '(?:.{65535}){' . $Ox . '}';
            }
            $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
        } else {
            $offset_pattern = '^';                      // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if ($length === null) {
            $length_pattern = '(.*)$';                  // the rest of the string
        } else {
            if (!isset($strlen)) {
                $strlen = self::strlen($str);           // see notes
            }
            if ($offset > $strlen) {
                return '';                              // another trivial case
            }

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($strlen - $offset, $length);
                $Lx = (int)($length / 65535);
                $Ly = $length % 65535;
                // +ve length requires ... a captured group of length characters
                if ($Lx) {
                    $length_pattern = '(?:.{65535}){' . $Lx . '}';
                }
                $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
            } elseif ($length < 0) {
                if ($length < ($offset - $strlen)) {
                    return '';
                }
                $Lx = (int)((-$length) / 65535);
                $Ly = (-$length) % 65535;
                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) {
                    $length_pattern = '(?:.{65535}){' . $Lx . '}';
                }
                $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
            }
        }

        // no match (or a PCRE error, e.g. on invalid UTF-8) yields an empty string
        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) {
            return '';
        }
        return $match[1];
    }

    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Unlike the native function the default length is 0, i.e. the replacement
     * is inserted at $start unless a length is given.
     *
     * @param string $string input string
     * @param string $replacement the replacement
     * @param int $start the replacing will begin at the start'th character offset into string.
     *                            If negative, it is counted from the end of string.
     * @param int $length If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If negative, it represents the number of characters from the
     *                            end of string at which to stop replacing. If length is zero then this function
     *                            will have the effect of inserting replacement into string at the given start
     *                            offset.
     * @return string
     * @see    substr_replace()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function substr_replace($string, $replacement, $start, $length = 0)
    {
        $start = (int)$start;
        $length = (int)$length;

        // negative values are relative to the end of the string; translate them into
        // absolute character positions first (the string length is only needed here)
        if ($start < 0 || $length < 0) {
            $strlen = self::strlen($string);

            if ($start < 0) {
                $start = max(0, $strlen + $start);
            } elseif ($start > $strlen) {
                $start = $strlen;
            }

            if ($length < 0) {
                // stop -$length characters before the end, but never before $start
                $length = max(0, $strlen - $start + $length);
            }
        }

        $ret = '';
        if ($start > 0) {
            $ret .= self::substr($string, 0, $start);
        }
        $ret .= $replacement;
        $ret .= self::substr($string, $start + $length);
        return $ret;
    }
    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps

    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the native ltrim() is used. With a charlist, every
     * character in it is stripped from the start of the string; the charlist
     * is taken literally (no ranges).
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function ltrim($str, $charlist = '')
    {
        if ($charlist === '') {
            return ltrim($str);
        }

        // quote the charlist for use in a character class; preg_quote() also
        // covers '^', which would otherwise negate the class
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
    }

    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the native rtrim() is used. With a charlist, every
     * character in it is stripped from the end of the string; the charlist
     * is taken literally (no ranges).
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist === '') {
            return rtrim($str);
        }

        // quote the charlist for use in a character class; preg_quote() also
        // covers '^', which would otherwise negate the class
        $charlist = preg_quote($charlist, '/');

        // the D modifier makes '$' match at the very end only, not before a trailing newline
        return preg_replace('/[' . $charlist . ']+$/uD', '', $str);
    }

    /**
     * Unicode aware replacement for trim()
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist === '') {
            return trim($str);
        }

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available; the result is additionally run
     * through the Normalizer class when that class is loaded. Without
     * mbstring a lookup table is used.
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     */
    public static function strtolower($string)
    {
        if ($string === null) {
            return ''; // pre-8.1 behaviour
        }

        if (UTF8_MBSTRING) {
            $lower = mb_strtolower($string, 'utf-8');

            if (class_exists('Normalizer', false)) {
                // normalize() returns false on failure; keep the lowercased string then
                $normalized = \Normalizer::normalize($lower);
                if ($normalized !== false) {
                    return $normalized;
                }
            }

            return $lower;
        }

        return strtr($string, Table::upperCaseToLowerCase());
    }

    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available, a lookup table otherwise.
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtolower()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     */
    public static function strtoupper($string)
    {
        if ($string === null) {
            return ''; // pre-8.1 behaviour, same as strtolower()
        }

        if (UTF8_MBSTRING) {
            return mb_strtoupper($string, 'utf-8');
        }

        return strtr($string, Table::lowerCaseToUpperCase());
    }


    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     * @author Harry Fuecks
     *
     */
    public static function ucfirst($str)
    {
        switch (self::strlen($str)) {
            case 0:
                return '';
            case 1:
                return self::strtoupper($str);
            default:
                // split into first character and remainder
                if (!preg_match('/^(.{1})(.*)$/us', $str, $matches)) {
                    // pattern did not apply (e.g. invalid UTF-8): leave the input untouched
                    return (string)$str;
                }
                return self::strtoupper($matches[1]) . $matches[2];
        }
    }

    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @param string $str
     * @return string with first char of each word uppercase
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     */
    public static function ucwords($str)
    {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $ws = '\x0c\x09\x0b\x0a\x0d\x20';

        // group 1: start of string or leading whitespace
        // group 2: first character of the word
        // group 3: remainder of the word
        $pattern = '/(^|[' . $ws . ']+)([^' . $ws . '])([^' . $ws . ']*)/u';

        return preg_replace_callback(
            $pattern,
            static function ($matches) {
                return $matches[1] . self::strtoupper($matches[2]) . $matches[3];
            },
            $str
        );
    }

    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done with the byte based strpos(); the byte
     * position found is then converted into a character position. Negative
     * offsets are handed to strpos() as they are and are therefore counted
     * in bytes, not characters.
     *
     * @param string $haystack
     * @param string $needle
     * @param int $offset number of characters to skip before searching
     * @return int|false character position of the first match, false if there is none
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     */
    public static function strpos($haystack, $needle, $offset = 0)
    {
        $comp = 0;      // bytes to add to the character offset to get closer to the byte offset
        $length = null; // character position of the most recent match

        while ($length === null || $length < $offset) {
            $pos = strpos($haystack, $needle, $offset + $comp);

            if ($pos === false) {
                return false;
            }

            // convert the byte position into a character position
            $length = self::strlen(substr($haystack, 0, $pos));

            // match lies before the requested character offset: multibyte characters made
            // the byte offset too small, so correct it by the surplus bytes and search again
            if ($length < $offset) {
                $comp = $pos - $length;
            }
        }

        return $length;
    }
}
374