<?php

namespace dokuwiki\Utf8;

/**
 * UTF-8 aware equivalents to PHP's string functions
 *
 * Most methods pick their implementation based on the UTF8_MBSTRING constant:
 * when it is truthy the mbstring extension is used, otherwise a pure PHP
 * (PCRE / lookup table) fallback is taken.
 */
class PhpString
{
    /**
     * A locale independent basename() implementation
     *
     * works around a bug in PHP's basename() implementation
     *
     * Both '/' and '\' are treated as directory separators.
     *
     * @param string $path A path
     * @param string $suffix If the name component ends in suffix this will also be cut off
     * @return string
     * @link https://bugs.php.net/bug.php?id=37738
     *
     * @see basename()
     */
    public static function basename($path, $suffix = '')
    {
        // leading and trailing separators never belong to the name component
        $path = trim($path, '\\/');

        // position of the last separator of either kind; because the path was trimmed
        // a separator can never sit at position 0, so a simple truth check is enough
        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
        if ($rpos) {
            $path = substr($path, $rpos + 1);
        }

        $suflen = strlen($suffix);
        if ($suflen && str_ends_with($path, $suffix)) {
            $path = substr($path, 0, -$suflen);
        }

        return $path;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Tries mbstring first (when UTF8_MBSTRING is set), then iconv, then utf8_decode().
     *
     * utf8_decode() converts characters that are not in ISO-8859-1
     * to '?', which, for the purpose of counting, is alright
     *
     * If none of these is available the byte length is returned.
     *
     * @param string $string
     * @return int
     * @see utf8_decode()
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see strlen()
     */
    public static function strlen($string)
    {
        if (UTF8_MBSTRING) {
            return mb_strlen($string, 'UTF-8');
        }

        if (function_exists('iconv_strlen')) {
            return iconv_strlen($string, 'UTF-8');
        }

        // utf8_decode is deprecated
        if (function_exists('utf8_decode')) {
            return strlen(utf8_decode($string));
        }

        // last resort: plain byte count
        return strlen($string);
    }

    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * @param string $str
     * @param int $offset number of UTF-8 characters offset (from left)
     * @param int $length (optional) length in UTF-8 characters from offset
     * @return string
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     */
    public static function substr($str, $offset, $length = null)
    {
        if (UTF8_MBSTRING) {
            // name the encoding explicitly (as strlen() does) instead of relying on
            // mb_internal_encoding(); a null length means "up to the end of the string"
            return mb_substr($str, $offset, $length, 'UTF-8');
        }

        /*
         * Notes:
         *
         * no mb string support, so we'll use pcre regex's with 'u' flag
         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
         *
         * substr documentation states false can be returned in some cases (e.g. offset > string length)
         * mb_substr never returns false, it will return an empty string instead.
         *
         * calculating the number of characters in the string is a relatively expensive operation, so
         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
         */

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str; // generates E_NOTICE for PHP4 objects, but not PHP5 objects
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        $offset_pattern = '';
        $length_pattern = '';

        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
        if ($offset < 0) {
            $strlen = self::strlen($str); // see notes
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // establish a pattern for offset, a non-captured group equal in length to offset
        if ($offset > 0) {
            $Ox = (int)($offset / 65535);
            $Oy = $offset % 65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){' . $Ox . '}';
            $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
        } else {
            $offset_pattern = '^'; // offset == 0; just anchor the pattern
        }

        // establish a pattern for length
        if ($length === null) {
            $length_pattern = '(.*)$'; // the rest of the string
        } else {
            if (!isset($strlen)) $strlen = self::strlen($str); // see notes
            if ($offset > $strlen) return ''; // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($strlen - $offset, $length);
                $Lx = (int)($length / 65535);
                $Ly = $length % 65535;
                // +ve length requires ... a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
                $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
            } elseif ($length < 0) {
                if ($length < ($offset - $strlen)) return '';
                $Lx = (int)((-$length) / 65535);
                $Ly = (-$length) % 65535;
                // -ve length requires ... capture everything except a group of -length characters
                //                         anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
                $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
            }
        }

        // a failed match (e.g. offset beyond the end of the string) yields an empty string
        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
        return $match[1];
    }

    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Offsets and lengths are counted in characters. Note that, unlike the native
     * function, the default length is 0, i.e. the replacement is inserted.
     *
     * @param string $string input string
     * @param string $replacement the replacement
     * @param int $start the replacing will begin at the start'th offset into string. If negative, it is
     *                    counted from the end of string.
     * @param int $length If given and is positive, it represents the length of the portion of string which is
     *                    to be replaced. If negative, it represents the number of characters from the end of
     *                    string at which to stop replacing. If length is zero then this function will have the
     *                    effect of inserting replacement into string at the given start offset.
     * @return string
     * @see substr_replace()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function substr_replace($string, $replacement, $start, $length = 0)
    {
        // negative values are relative to the end of the string; turn them into plain
        // non-negative character positions first. The (comparatively expensive) length
        // calculation is only done when it is actually needed.
        if ($start < 0 || $length < 0) {
            $strlen = self::strlen($string);

            if ($start < 0) {
                $start = max(0, $strlen + $start);
            } else {
                $start = min($start, $strlen);
            }

            if ($length < 0) {
                // stop replacing -$length characters before the end, but never before $start
                $length = max(0, $strlen - $start + $length);
            }
        }

        $ret = '';
        if ($start > 0) $ret .= self::substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= self::substr($string, $start + $length);
        return $ret;
    }
    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps

    /**
     * Unicode aware replacement for ltrim()
     *
     * With an empty charlist the native ltrim() is used. Otherwise every character
     * of the charlist is taken literally and stripped from the start of the string.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see ltrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function ltrim($str, $charlist = '')
    {
        if ($charlist === '') return ltrim($str);

        // quote charlist for use in a character class; preg_quote() also covers '^',
        // which would otherwise negate (or break) the class when it comes first
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
    }

    /**
     * Unicode aware replacement for rtrim()
     *
     * With an empty charlist the native rtrim() is used. Otherwise every character
     * of the charlist is taken literally and stripped from the end of the string.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see rtrim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist === '') return rtrim($str);

        // quote charlist for use in a character class; preg_quote() also covers '^',
        // which would otherwise negate (or break) the class when it comes first
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/[' . $charlist . ']+$/u', '', $str);
    }

    /**
     * Unicode aware replacement for trim()
     *
     * With an empty charlist the native trim() is used, otherwise the string is
     * passed through rtrim() and ltrim() of this class.
     *
     * @param string $str
     * @param string $charlist
     * @return string
     * @see trim()
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist === '') return trim($str);

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available. In that case the result is additionally
     * passed through Normalizer::normalize() when the Normalizer class is already loaded.
     *
     * @param string $string
     * @return string
     * @see \dokuwiki\Utf8\PhpString::strtoupper()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see strtolower()
     */
    public static function strtolower($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour
        if (UTF8_MBSTRING) {
            // do not trigger autoloading, only use the class when it already exists
            if (class_exists('Normalizer', false)) {
                return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
            }
            return (mb_strtolower($string, 'utf-8'));
        }
        return strtr($string, Table::upperCaseToLowerCase());
    }

    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses mb_string extension if available
     *
     * @param string $string
     * @return string
     * @see \dokuwiki\Utf8\PhpString::strtolower()
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see strtoupper()
     */
    public static function strtoupper($string)
    {
        if ($string === null) return ''; // pre-8.1 behaviour, consistent with strtolower()
        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');

        return strtr($string, Table::lowerCaseToUpperCase());
    }


    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     * @author Harry Fuecks
     *
     */
    public static function ucfirst($str)
    {
        switch (self::strlen($str)) {
            case 0:
                return '';
            case 1:
                return self::strtoupper($str);
            default:
                // split into first character and remainder; the match can fail (e.g. when
                // the input is not valid UTF-8), in which case the input is returned as is
                if (!preg_match('/^(.{1})(.*)$/us', $str, $matches)) {
                    return $str;
                }
                return self::strtoupper($matches[1]) . $matches[2];
        }
    }

    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @param string $str
     * @return string with first char of each word uppercase
     * @author Harry Fuecks
     * @see http://php.net/ucwords
     *
     */
    public static function ucwords($str)
    {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback(
            $pattern,
            static function ($matches) {
                $leadingws = $matches[2]; // empty when the word starts the string
                $first = $matches[3];     // first character of the word

                // Everything following the first character. This is cut by byte length of the
                // captured groups rather than by trimming whitespace, because the native ltrim()
                // uses a different whitespace set than the pattern above (no form feed, but NUL)
                $rest = substr($matches[0], strlen($leadingws) + strlen($first));

                return $leadingws . self::strtoupper($first) . $rest;
            },
            $str
        );
    }

    /**
     * This is an Unicode aware replacement for strpos
     *
     * The offset and the returned position are counted in characters, not bytes.
     *
     * @param string $haystack
     * @param string $needle
     * @param integer $offset
     * @return int|false character position of the first match at or after offset, false if there is none
     * @author Leo Feyer <leo@typolight.org>
     * @see strpos()
     *
     */
    public static function strpos($haystack, $needle, $offset = 0)
    {
        $comp = 0;      // byte compensation added to the character offset for the next search
        $length = null; // character position of the most recent match

        while ($length === null || $length < $offset) {
            $pos = strpos($haystack, $needle, $offset + $comp);

            if ($pos === false) {
                return false;
            }

            // translate the byte position of the match into a character position
            $length = self::strlen(substr($haystack, 0, $pos));

            // the match lies before the requested character offset: multibyte characters
            // precede it, so move the byte offset forward by their surplus bytes and retry
            if ($length < $offset) {
                $comp = $pos - $length;
            }
        }

        return $length;
    }
}