<?php

namespace dokuwiki\Utf8;

/**
 * UTF-8 aware equivalents to PHP's string functions
 *
 * Several methods switch on the UTF8_MBSTRING constant, which is not defined in this file and
 * has to be provided by the including application. Where the mbstring branch is not taken, the
 * methods fall back to PCRE with the 'u' modifier or to the case mapping tables of the Table class.
 */
class PhpString
{
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * @param string $path A path
     * @param string $suffix If the name component ends in suffix this will also be cut off
     * @return string
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @see basename()
     */
    public static function basename($path, $suffix = '')
    {
        $suffix = (string)$suffix;

        // strip leading/trailing separators, then keep whatever follows the last separator
        $path = trim($path, '\\/');
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if ($rpos) {
            $path = substr($path, $rpos + 1);
        }

        // An empty suffix has to be skipped explicitly: str_ends_with() is true for an empty needle
        // and substr($path, 0, -0) would cut the name down to an empty string.
        if ($suffix !== '' && str_ends_with($path, $suffix)) {
            $path = substr($path, 0, -strlen($suffix));
        }

        return $path;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Tries mb_strlen(), then iconv_strlen(), then utf8_decode() and finally falls back to the
     * plain byte length.
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright
     *
     * @param string $string
     * @return int
     * @see    utf8_decode()
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     */
    public static function strlen($string)
    {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        if (function_exists('iconv_strlen')) {
            return iconv_strlen($string, 'UTF-8');
        }

        // utf8_decode is deprecated
        if (function_exists('utf8_decode')) {
            return strlen(utf8_decode($string));
        }

        // last resort: number of bytes
        return strlen($string);
    }

    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     */
    public static function substr($str, $offset, $length = null)
    {
        if (UTF8_MBSTRING) {
            // a null length means "up to the end of the string"; the encoding is passed explicitly
            // so the result does not depend on the configured mbstring internal encoding
            return mb_substr($str, $offset, $length, 'UTF-8');
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str; // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = self::strlen($str); // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset / 65535);
            $Oy = $offset % 65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){' . $Ox . '}';
            $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
        } else {
            $offset_pattern = '^'; // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if ($length === null) {
            $length_pattern = '(.*)$'; // the rest of the string
        } else {
            if (!isset($strlen)) $strlen = self::strlen($str); // see notes
            if ($offset > $strlen) return ''; // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($strlen - $offset, $length);
                $Lx = (int)($length / 65535);
                $Ly = $length % 65535;
                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
                $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
            } elseif ($length < 0) {
                if ($length < ($offset - $strlen)) return '';
                $Lx = (int)((-$length) / 65535);
                $Ly = (-$length) % 65535;
                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
                $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
            }
        }

        // the requested part is always the first captured group
        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
        return $match[1];
    }

    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
    /**
     * Unicode aware replacement for substr_replace()
     *
     * The result is built from the first $start characters of the string, the replacement and
     * everything from character offset $start + $length onwards.
     *
     * NOTE: no leading part of the string is kept when $start is zero or negative, so negative
     * $start values do not behave like they do in the native substr_replace().
     *
     * @param string $string input string
     * @param string $replacement the replacement
     * @param int $start the replacing will begin at the start'th offset into string.
     * @param int $length If given and is positive, it represents the length of the portion of string which is
     *                            to be replaced. If length is zero then this function will have the effect of inserting
     *                            replacement into string at the given start offset.
     * @return string
     * @see    substr_replace()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function substr_replace($string, $replacement, $start, $length = 0)
    {
        $ret = '';
        if ($start > 0) $ret .= self::substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= self::substr($string, $start + $length);
        return $ret;
    }
    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps

    /**
     * Unicode aware replacement for ltrim()
     *
     * Every character of $charlist is taken literally; ranges like "a..z" are not supported.
     * Without a charlist the native ltrim() is used.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function ltrim($str, $charlist = '')
    {
        if ($charlist === '') return ltrim($str);

        // quote charlist for use in a character class; this covers '^', ']', '-', '\' and the delimiter
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
    }

    /**
     * Unicode aware replacement for rtrim()
     *
     * Every character of $charlist is taken literally; ranges like "a..z" are not supported.
     * Without a charlist the native rtrim() is used.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist === '') return rtrim($str);

        // quote charlist for use in a character class; this covers '^', ']', '-', '\' and the delimiter
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/[' . $charlist . ']+$/u', '', $str);
    }

    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist the native trim() is used, otherwise the string is passed through
     * rtrim() and ltrim() of this class.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see    trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist === '') return trim($str);

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available, the result is additionally passed through
     * Normalizer::normalize() when the Normalizer class is loaded.
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtoupper()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     */
    public static function strtolower($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour
        if (UTF8_MBSTRING) {
            // second argument false: do not trigger autoloading just for this check
            if (class_exists('Normalizer', false)) {
                return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
            }
            return (mb_strtolower($string, 'utf-8'));
        }
        return strtr($string, Table::upperCaseToLowerCase());
    }

    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available
     *
     * @param string $string
     * @return string
     * @see    \dokuwiki\Utf8\PhpString::strtolower()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     */
    public static function strtoupper($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour, same as strtolower()
        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

        return strtr($string, Table::lowerCaseToUpperCase());
    }


    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     * @author Harry Fuecks
     *
     */
    public static function ucfirst($str)
    {
        switch (self::strlen($str)) {
            case 0:
                return '';
            case 1:
                return self::strtoupper($str);
            default:
                // split into the first character and the remainder, only the former is changed
                preg_match('/^(.{1})(.*)$/us', $str, $matches);
                return self::strtoupper($matches[1]) . $matches[2];
        }
    }

    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @param string $str
     * @return string with first char of each word uppercase
     * @author Harry Fuecks
     * @see    http://php.net/ucwords
     *
     */
    public static function ucwords($str)
    {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback(
            $pattern,
            function ($matches) {
                $leadingws = $matches[2];
                $ucfirst = self::strtoupper($matches[3]);
                // The remainder of the word is cut out by the byte lengths of the captured groups.
                // A native ltrim() is not used here because its default character set differs from
                // the whitespace set of the pattern (no form feed, but NUL bytes).
                $rest = substr($matches[0], strlen($leadingws) + strlen($matches[3]));
                return $leadingws . $ucfirst . $rest;
            },
            $str
        );
    }

    /**
     * This is an Unicode aware replacement for strpos
     *
     * The search itself is done bytewise by the native strpos(), the byte position found is then
     * converted to a character position. The search is repeated from a corrected byte offset for as
     * long as the match is located before the requested character offset.
     *
     * @param string $haystack
     * @param string $needle
     * @param integer $offset offset in characters
     * @return integer|false character position of the needle, false when it was not found
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     */
    public static function strpos($haystack, $needle, $offset = 0)
    {
        $comp = 0; // difference between byte and character position of the last match
        $length = null;

        while ($length === null || $length < $offset) {
            $pos = strpos($haystack, $needle, $offset + $comp);

            if ($pos === false) {
                return false;
            }

            // number of characters in front of the match
            $length = self::strlen(substr($haystack, 0, $pos));

            if ($length < $offset) {
                $comp = $pos - $length;
            }
        }

        return $length;
    }
}