<?php

/**
 * This file is part of the Nette Framework (https://nette.org)
 * Copyright (c) 2004 David Grudl (https://davidgrudl.com)
 */

declare(strict_types=1);

namespace Nette\Utils;

use JetBrains\PhpStorm\Language;
use Nette;
use function is_array, is_object, strlen;


/**
 * String tools library.
 */
class Strings
{
	use Nette\StaticClass;

	public const TrimCharacters = " \t\n\r\0\x0B\u{A0}";

	/** @deprecated use Strings::TrimCharacters */
	public const TRIM_CHARACTERS = self::TrimCharacters;


	/**
	 * Checks that the string survives fixEncoding() unchanged, i.e. contains no invalid UTF-8.
	 * @deprecated use Nette\Utils\Validator::isUnicode()
	 */
	public static function checkEncoding(string $s): bool
	{
		return $s === self::fixEncoding($s);
	}


	/**
	 * Removes all invalid UTF-8 characters from a string.
	 */
	public static function fixEncoding(string $s): string
	{
		// removes xD800-xDFFF, x110000 and higher
		return htmlspecialchars_decode(htmlspecialchars($s, ENT_NOQUOTES | ENT_IGNORE, 'UTF-8'), ENT_NOQUOTES);
	}


	/**
	 * Returns a specific character in UTF-8 from code point (number in range 0x0000..D7FF or 0xE000..10FFFF).
	 * @throws Nette\InvalidArgumentException if code point is not in valid range
	 * @throws Nette\NotSupportedException if the iconv extension is not loaded
	 */
	public static function chr(int $code): string
	{
		if ($code < 0 || ($code >= 0xD800 && $code <= 0xDFFF) || $code > 0x10FFFF) {
			throw new Nette\InvalidArgumentException('Code point must be in range 0x0 to 0xD7FF or 0xE000 to 0x10FFFF.');
		} elseif (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
		}

		// the code point packed as a 32-bit big-endian integer is exactly its UTF-32BE encoding
		return iconv('UTF-32BE', 'UTF-8//IGNORE', pack('N', $code));
	}


	/**
	 * Returns a code point of specific character in UTF-8 (number in range 0x0000..D7FF or 0xE000..10FFFF).
	 * @throws Nette\InvalidArgumentException if the input is empty or cannot be converted
	 * @throws Nette\NotSupportedException if the iconv extension is not loaded
	 */
	public static function ord(string $c): int
	{
		if (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
		}

		$tmp = iconv('UTF-8', 'UTF-32BE//IGNORE', $c);
		if (!$tmp) {
			throw new Nette\InvalidArgumentException('Invalid UTF-8 character "' . ($c === '' ? '' : '\x' . strtoupper(bin2hex($c))) . '".');
		}

		// only the first 32-bit unit is decoded
		return unpack('N', $tmp)[1];
	}


	/**
	 * @deprecated use str_starts_with()
	 */
	public static function startsWith(string $haystack, string $needle): bool
	{
		return str_starts_with($haystack, $needle);
	}


	/**
	 * @deprecated use str_ends_with()
	 */
	public static function endsWith(string $haystack, string $needle): bool
	{
		return str_ends_with($haystack, $needle);
	}


	/**
	 * @deprecated use str_contains()
	 */
	public static function contains(string $haystack, string $needle): bool
	{
		return str_contains($haystack, $needle);
	}


	/**
	 * Returns a part of UTF-8 string specified by starting position and length. If start is negative,
	 * the returned string will start at the start'th character from the end of string.
	 * @throws Nette\NotSupportedException if neither mbstring nor iconv is available
	 */
	public static function substring(string $s, int $start, ?int $length = null): string
	{
		if (function_exists('mb_substr')) {
			return mb_substr($s, $start, $length, 'UTF-8'); // MB is much faster
		} elseif (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires extension ICONV or MBSTRING, neither is loaded.');
		} elseif ($length === null) {
			$length = self::length($s);
		} elseif ($start < 0 && $length < 0) {
			$start += self::length($s); // unifies iconv_substr behavior with mb_substr
		}

		return iconv_substr($s, $start, $length, 'UTF-8');
	}


	/**
	 * Removes control characters, normalizes line breaks to `\n`, removes leading and trailing blank lines,
	 * trims end spaces on lines, normalizes UTF-8 to the normal form of NFC.
	 */
	public static function normalize(string $s): string
	{
		// convert to compressed normal form (NFC)
		if (class_exists('Normalizer', false) && ($n = \Normalizer::normalize($s, \Normalizer::FORM_C)) !== false) {
			$s = $n;
		}

		$s = self::unixNewLines($s);

		// remove control characters; leave \t + \n
		$s = self::pcre('preg_replace', ['#[\x00-\x08\x0B-\x1F\x7F-\x9F]+#u', '', $s]);

		// right trim
		$s = self::pcre('preg_replace', ['#[\t ]+$#m', '', $s]);

		// leading and trailing blank lines
		$s = trim($s, "\n");

		return $s;
	}


	/** @deprecated use Strings::unixNewLines() */
	public static function normalizeNewLines(string $s): string
	{
		return self::unixNewLines($s);
	}


	/**
	 * Converts line endings to \n used on Unix-like systems.
	 * Line endings are: \n, \r, \r\n, U+2028 line separator, U+2029 paragraph separator.
	 */
	public static function unixNewLines(string $s): string
	{
		return preg_replace("~\r\n?|\u{2028}|\u{2029}~", "\n", $s);
	}


	/**
	 * Converts line endings to platform-specific, i.e. \r\n on Windows and \n elsewhere.
	 * Line endings are: \n, \r, \r\n, U+2028 line separator, U+2029 paragraph separator.
	 */
	public static function platformNewLines(string $s): string
	{
		return preg_replace("~\r\n?|\n|\u{2028}|\u{2029}~", PHP_EOL, $s);
	}


	/**
	 * Converts UTF-8 string to ASCII, ie removes diacritics etc.
	 * The strategy depends on what is available: intl Transliterator and/or the iconv implementation
	 * (glibc vs libiconv); without either, non-ASCII characters are simply dropped.
	 */
	public static function toAscii(string $s): string
	{
		$iconv = defined('ICONV_IMPL') ? trim(ICONV_IMPL, '"\'') : null;
		static $transliterator = null; // created once; false when intl is unavailable
		if ($transliterator === null) {
			if (class_exists('Transliterator', false)) {
				$transliterator = \Transliterator::create('Any-Latin; Latin-ASCII');
			} else {
				trigger_error(__METHOD__ . "(): it is recommended to enable PHP extensions 'intl'.", E_USER_NOTICE);
				$transliterator = false;
			}
		}

		// remove control characters and check UTF-8 validity
		$s = self::pcre('preg_replace', ['#[^\x09\x0A\x0D\x20-\x7E\xA0-\x{2FF}\x{370}-\x{10FFFF}]#u', '', $s]);

		// transliteration (by Transliterator and iconv) is not optimal, replace some characters directly
		$s = strtr($s, ["\u{201E}" => '"', "\u{201C}" => '"', "\u{201D}" => '"', "\u{201A}" => "'", "\u{2018}" => "'", "\u{2019}" => "'", "\u{B0}" => '^', "\u{42F}" => 'Ya', "\u{44F}" => 'ya', "\u{42E}" => 'Yu', "\u{44E}" => 'yu', "\u{c4}" => 'Ae', "\u{d6}" => 'Oe', "\u{dc}" => 'Ue', "\u{1e9e}" => 'Ss', "\u{e4}" => 'ae', "\u{f6}" => 'oe', "\u{fc}" => 'ue', "\u{df}" => 'ss']); // „ “ ” ‚ ‘ ’ ° Я я Ю ю Ä Ö Ü ẞ ä ö ü ß
		if ($iconv !== 'libiconv') {
			$s = strtr($s, ["\u{AE}" => '(R)', "\u{A9}" => '(c)', "\u{2026}" => '...', "\u{AB}" => '<<', "\u{BB}" => '>>', "\u{A3}" => 'lb', "\u{A5}" => 'yen', "\u{B2}" => '^2', "\u{B3}" => '^3', "\u{B5}" => 'u', "\u{B9}" => '^1', "\u{BA}" => 'o', "\u{BF}" => '?', "\u{2CA}" => "'", "\u{2CD}" => '_', "\u{2DD}" => '"', "\u{1FEF}" => '', "\u{20AC}" => 'EUR', "\u{2122}" => 'TM', "\u{212E}" => 'e', "\u{2190}" => '<-', "\u{2191}" => '^', "\u{2192}" => '->', "\u{2193}" => 'V', "\u{2194}" => '<->']); // ® © … « » £ ¥ ² ³ µ ¹ º ¿ ˊ ˍ ˝ ` € ™ ℮ ← ↑ → ↓ ↔
		}

		if ($transliterator) {
			$s = $transliterator->transliterate($s);
			// use iconv because The transliterator leaves some characters out of ASCII, eg → ʾ
			if ($iconv === 'glibc') {
				$s = strtr($s, '?', "\x01"); // temporarily hide ? to distinguish them from the garbage that iconv creates
				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
				$s = str_replace(['?', "\x01"], ['', '?'], $s); // remove garbage and restore ? characters
			} elseif ($iconv === 'libiconv') {
				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
			} else { // null or 'unknown' (#216)
				$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]); // remove non-ascii chars
			}
		} elseif ($iconv === 'glibc' || $iconv === 'libiconv') {
			// temporarily hide these characters to distinguish them from the garbage that iconv creates
			$s = strtr($s, '`\'"^~?', "\x01\x02\x03\x04\x05\x06");
			if ($iconv === 'glibc') {
				// glibc implementation is very limited. transliterate into Windows-1250 and then into ASCII, so most Eastern European characters are preserved
				$s = iconv('UTF-8', 'WINDOWS-1250//TRANSLIT//IGNORE', $s);
				$s = strtr(
					$s,
					"\xa5\xa3\xbc\x8c\xa7\x8a\xaa\x8d\x8f\x8e\xaf\xb9\xb3\xbe\x9c\x9a\xba\x9d\x9f\x9e\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\x96\xa0\x8b\x97\x9b\xa6\xad\xb7",
					'ALLSSSSTZZZallssstzzzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTsraaaalccceeeeiiddnnooooruuuuyt- <->|-.',
				);
				$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]);
			} else {
				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
			}

			// remove garbage that iconv creates during transliteration (eg Ý -> Y')
			$s = str_replace(['`', "'", '"', '^', '~', '?'], '', $s);
			// restore temporarily hidden characters
			$s = strtr($s, "\x01\x02\x03\x04\x05\x06", '`\'"^~?');
		} else {
			$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]); // remove non-ascii chars
		}

		return $s;
	}


	/**
	 * Modifies the UTF-8 string to the form used in the URL, ie removes diacritics and replaces all characters
	 * except letters of the English alphabet and numbers with a hyphens.
	 * Characters listed in $charlist are kept as well; leading and trailing hyphens are stripped.
	 */
	public static function webalize(string $s, ?string $charlist = null, bool $lower = true): string
	{
		$s = self::toAscii($s);
		if ($lower) {
			$s = strtolower($s);
		}

		$s = self::pcre('preg_replace', ['#[^a-z0-9' . ($charlist !== null ? preg_quote($charlist, '#') : '') . ']+#i', '-', $s]);
		$s = trim($s, '-');
		return $s;
	}


	/**
	 * Truncates a UTF-8 string to given maximal length, while trying not to split whole words. Only if the string is truncated,
	 * an ellipsis (or something else set with third argument) is appended to the string.
	 */
	public static function truncate(string $s, int $maxLen, string $append = "\u{2026}"): string
	{
		if (self::length($s) > $maxLen) {
			$maxLen -= self::length($append); // reserve room for the appended marker
			if ($maxLen < 1) {
				return $append;

			} elseif ($matches = self::match($s, '#^.{1,' . $maxLen . '}(?=[\s\x00-/:-@\[-`{-~])#us')) {
				// longest prefix that is followed by whitespace or ASCII punctuation
				return $matches[0] . $append;

			} else {
				return self::substring($s, 0, $maxLen) . $append;
			}
		}

		return $s;
	}


	/**
	 * Indents a multiline text from the left. Second argument sets how many indentation chars should be used,
	 * while the indent itself is the third argument (*tab* by default).
	 */
	public static function indent(string $s, int $level = 1, string $chars = "\t"): string
	{
		if ($level > 0) {
			// insert the indent at the start of every non-empty line
			$s = self::replace($s, '#(?:^|[\r\n]+)(?=[^\r\n])#', '$0' . str_repeat($chars, $level));
		}

		return $s;
	}


	/**
	 * Converts all characters of UTF-8 string to lower case.
	 */
	public static function lower(string $s): string
	{
		return mb_strtolower($s, 'UTF-8');
	}


	/**
	 * Converts the first character of a UTF-8 string to lower case and leaves the other characters unchanged.
	 */
	public static function firstLower(string $s): string
	{
		return self::lower(self::substring($s, 0, 1)) . self::substring($s, 1);
	}


	/**
	 * Converts all characters of a UTF-8 string to upper case.
	 */
	public static function upper(string $s): string
	{
		return mb_strtoupper($s, 'UTF-8');
	}


	/**
	 * Converts the first character of a UTF-8 string to upper case and leaves the other characters unchanged.
	 */
	public static function firstUpper(string $s): string
	{
		return self::upper(self::substring($s, 0, 1)) . self::substring($s, 1);
	}


	/**
	 * Converts the first character of every word of a UTF-8 string to upper case and the others to lower case.
	 */
	public static function capitalize(string $s): string
	{
		return mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
	}


	/**
	 * Compares two UTF-8 strings or their parts, without taking character case into account. If length is null, whole strings are compared,
	 * if it is negative, the corresponding number of characters from the end of the strings is compared,
	 * otherwise the appropriate number of characters from the beginning is compared.
	 */
	public static function compare(string $left, string $right, ?int $length = null): bool
	{
		if (class_exists('Normalizer', false)) {
			// Normalizer::normalize() returns false when it cannot process the input;
			// keep the original string in that case (same guard as in normalize())
			if (($n = \Normalizer::normalize($left, \Normalizer::FORM_D)) !== false) { // form NFD is faster
				$left = $n;
			}

			if (($n = \Normalizer::normalize($right, \Normalizer::FORM_D)) !== false) { // form NFD is faster
				$right = $n;
			}
		}

		if ($length < 0) {
			$left = self::substring($left, $length, -$length);
			$right = self::substring($right, $length, -$length);
		} elseif ($length !== null) {
			$left = self::substring($left, 0, $length);
			$right = self::substring($right, 0, $length);
		}

		return self::lower($left) === self::lower($right);
	}


	/**
	 * Finds the common prefix of strings or returns empty string if the prefix was not found.
	 * An empty list yields an empty string.
	 * @param  string[]  $strings
	 */
	public static function findPrefix(array $strings): string
	{
		$first = array_shift($strings);
		if ($first === null) { // empty list: nothing to compare
			return '';
		}

		for ($i = 0; $i < strlen($first); $i++) {
			foreach ($strings as $s) {
				if (!isset($s[$i]) || $first[$i] !== $s[$i]) {
					// the byte-wise mismatch may be inside a multi-byte sequence; step back over continuation bytes
					while ($i && $first[$i - 1] >= "\x80" && $first[$i] >= "\x80" && $first[$i] < "\xC0") {
						$i--;
					}

					return substr($first, 0, $i);
				}
			}
		}

		return $first;
	}


	/**
	 * Returns number of characters (not bytes) in UTF-8 string.
	 * That is the number of Unicode code points which may differ from the number of graphemes.
	 */
	public static function length(string $s): int
	{
		// NOTE(review): the fallback relies on utf8_decode(), which PHP deprecates since 8.2; it is only reached without mbstring
		return function_exists('mb_strlen')
			? mb_strlen($s, 'UTF-8')
			: strlen(utf8_decode($s));
	}


	/**
	 * Removes all left and right side spaces (or the characters passed as second argument) from a UTF-8 encoded string.
	 */
	public static function trim(string $s, string $charlist = self::TrimCharacters): string
	{
		$charlist = preg_quote($charlist, '#');
		return self::replace($s, '#^[' . $charlist . ']+|[' . $charlist . ']+$#Du', '');
	}


	/**
	 * Pads a UTF-8 string to given length by prepending the $pad string to the beginning.
	 * @param  non-empty-string  $pad
	 */
	public static function padLeft(string $s, int $length, string $pad = ' '): string
	{
		$length = max(0, $length - self::length($s)); // number of characters still missing
		$padLen = self::length($pad);
		return str_repeat($pad, (int) ($length / $padLen)) . self::substring($pad, 0, $length % $padLen) . $s;
	}


	/**
	 * Pads UTF-8 string to given length by appending the $pad string to the end.
	 * @param  non-empty-string  $pad
	 */
	public static function padRight(string $s, int $length, string $pad = ' '): string
	{
		$length = max(0, $length - self::length($s)); // number of characters still missing
		$padLen = self::length($pad);
		return $s . str_repeat($pad, (int) ($length / $padLen)) . self::substring($pad, 0, $length % $padLen);
	}


	/**
	 * Reverses UTF-8 string.
	 * @throws Nette\NotSupportedException if the iconv extension is not loaded
	 */
	public static function reverse(string $s): string
	{
		if (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
		}

		// byte-reversing fixed-width UTF-32BE reverses the characters and turns each unit into UTF-32LE
		return iconv('UTF-32LE', 'UTF-8', strrev(iconv('UTF-8', 'UTF-32BE', $s)));
	}


	/**
	 * Returns part of $haystack before $nth occurrence of $needle or returns null if the needle was not found.
	 * Negative value means searching from the end.
	 */
	public static function before(string $haystack, string $needle, int $nth = 1): ?string
	{
		$pos = self::pos($haystack, $needle, $nth);
		return $pos === null
			? null
			: substr($haystack, 0, $pos);
	}


	/**
	 * Returns part of $haystack after $nth occurrence of $needle or returns null if the needle was not found.
	 * Negative value means searching from the end.
	 */
	public static function after(string $haystack, string $needle, int $nth = 1): ?string
	{
		$pos = self::pos($haystack, $needle, $nth);
		return $pos === null
			? null
			: substr($haystack, $pos + strlen($needle));
	}


	/**
	 * Returns position in characters of $nth occurrence of $needle in $haystack or null if the $needle was not found.
	 * Negative value of `$nth` means searching from the end.
	 */
	public static function indexOf(string $haystack, string $needle, int $nth = 1): ?int
	{
		$pos = self::pos($haystack, $needle, $nth);
		return $pos === null
			? null
			: self::length(substr($haystack, 0, $pos)); // byte offset -> character offset
	}


	/**
	 * Returns position in bytes of $nth occurrence of $needle in $haystack or null if the needle was not found.
	 * Negative value of `$nth` means searching from the end; zero always yields null.
	 */
	private static function pos(string $haystack, string $needle, int $nth = 1): ?int
	{
		if (!$nth) {
			return null;
		} elseif ($nth > 0) {
			if ($needle === '') {
				return 0;
			}

			$pos = 0;
			while (($pos = strpos($haystack, $needle, $pos)) !== false && --$nth) {
				$pos++;
			}
		} else {
			$len = strlen($haystack);
			if ($needle === '') {
				return $len;
			} elseif ($len === 0) {
				return null;
			}

			$pos = $len - 1;
			while (($pos = strrpos($haystack, $needle, $pos - $len)) !== false && ++$nth) {
				if ($pos === 0) {
					// nothing can precede a match at the very beginning; searching further
					// would pass strrpos() an offset outside the haystack (ValueError)
					return null;
				}

				$pos--;
			}
		}

		return Helpers::falseToNull($pos);
	}


	/**
	 * Divides the string into arrays according to the regular expression. Expressions in parentheses will be captured and returned as well.
	 * With $utf8 the pattern gets the `u` modifier and captured offsets are reported in characters instead of bytes.
	 */
	public static function split(
		string $subject,
		#[Language('RegExp')]
		string $pattern,
		bool|int $captureOffset = false,
		bool $skipEmpty = false,
		int $limit = -1,
		bool $utf8 = false,
	): array
	{
		$flags = is_int($captureOffset) // back compatibility
			? $captureOffset
			: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);

		$pattern .= $utf8 ? 'u' : '';
		$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);

		// convert offsets only when they were really captured (legacy int flags may not include the bit)
		return $utf8 && ($flags & PREG_SPLIT_OFFSET_CAPTURE)
			? self::bytesToChars($subject, [$m])[0]
			: $m;
	}


	/**
	 * Searches the string for the part matching the regular expression and returns
	 * an array with the found expression and individual subexpressions, or `null`.
	 * With $utf8 the $offset argument and captured offsets are in characters instead of bytes.
	 */
	public static function match(
		string $subject,
		#[Language('RegExp')]
		string $pattern,
		bool|int $captureOffset = false,
		int $offset = 0,
		bool $unmatchedAsNull = false,
		bool $utf8 = false,
	): ?array
	{
		$flags = is_int($captureOffset) // back compatibility
			? $captureOffset
			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);

		if ($utf8) {
			$offset = strlen(self::substring($subject, 0, $offset)); // character offset -> byte offset
			$pattern .= 'u';
		}

		if ($offset > strlen($subject)) {
			return null;
		} elseif (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
			return null;
		} elseif ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
			// convert offsets only when they were really captured (legacy int flags may not include the bit)
			return self::bytesToChars($subject, [$m])[0];
		} else {
			return $m;
		}
	}


	/**
	 * Searches the string for all occurrences matching the regular expression and
	 * returns an array of arrays containing the found expression and each subexpression.
	 * Results are in set order unless $patternOrder is requested. With $utf8 the $offset argument
	 * and captured offsets are in characters instead of bytes.
	 */
	public static function matchAll(
		string $subject,
		#[Language('RegExp')]
		string $pattern,
		bool|int $captureOffset = false,
		int $offset = 0,
		bool $unmatchedAsNull = false,
		bool $patternOrder = false,
		bool $utf8 = false,
	): array
	{
		$flags = is_int($captureOffset) // back compatibility
			? $captureOffset
			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);

		if ($utf8) {
			$offset = strlen(self::substring($subject, 0, $offset)); // character offset -> byte offset
			$pattern .= 'u';
		}

		if ($offset > strlen($subject)) {
			return [];
		}

		self::pcre('preg_match_all', [
			$pattern, $subject, &$m,
			($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
			$offset,
		]);

		// convert offsets only when they were really captured (legacy int flags may not include the bit)
		return $utf8 && ($flags & PREG_OFFSET_CAPTURE)
			? self::bytesToChars($subject, $m)
			: $m;
	}


	/**
	 * Replaces all occurrences matching regular expression $pattern which can be string or array in the form `pattern => replacement`.
	 * $replacement may be a string or a callback; $captureOffset and $unmatchedAsNull apply to the callback form only.
	 * @throws Nette\InvalidStateException if an object/array replacement is not callable
	 */
	public static function replace(
		string $subject,
		#[Language('RegExp')]
		string|array $pattern,
		string|callable $replacement = '',
		int $limit = -1,
		bool $captureOffset = false,
		bool $unmatchedAsNull = false,
		bool $utf8 = false,
	): string
	{
		if (is_object($replacement) || is_array($replacement)) {
			if (!is_callable($replacement, false, $textual)) {
				throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
			}

			$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
			if ($utf8) {
				// append the modifier to every pattern; a plain `.=` would stringify an array of patterns
				$pattern = is_array($pattern)
					? array_map(fn($item) => $item . 'u', $pattern)
					: $pattern . 'u';
				if ($captureOffset) {
					// hand the callback character offsets instead of byte offsets
					$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
				}
			}

			return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);

		} elseif (is_array($pattern) && is_string(key($pattern))) {
			// `pattern => replacement` map
			$replacement = array_values($pattern);
			$pattern = array_keys($pattern);
		}

		if ($utf8) {
			$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
		}

		return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
	}


	/**
	 * Converts byte offsets in `[text, offset]` pairs (as produced with offset capture) to character offsets.
	 * Works incrementally from the previously converted offset, in either direction.
	 * Pairs with a negative offset (subpattern that did not participate in the match) are left untouched.
	 */
	private static function bytesToChars(string $s, array $groups): array
	{
		$lastBytes = $lastChars = 0;
		foreach ($groups as &$matches) {
			foreach ($matches as &$match) {
				if ($match[1] < 0) { // unmatched subpattern is reported with offset -1
					continue;
				}

				if ($match[1] > $lastBytes) {
					$lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes));
				} elseif ($match[1] < $lastBytes) {
					$lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1]));
				}

				$lastBytes = $match[1];
				$match[1] = $lastChars;
			}

			unset($match);
		}

		unset($matches);

		return $groups;
	}


	/**
	 * Calls a preg_* function and converts both compile-time and run-time PCRE failures to RegexpException.
	 * @internal
	 */
	public static function pcre(string $func, array $args)
	{
		$res = Callback::invokeSafe($func, $args, function (string $message) use ($args): void {
			// compile-time error, not detectable by preg_last_error
			throw new RegexpException($message . ' in pattern: ' . implode(' or ', (array) $args[0]));
		});

		if (($code = preg_last_error()) // run-time error, but preg_last_error & return code are liars
			&& ($res === null || !in_array($func, ['preg_filter', 'preg_replace_callback', 'preg_replace'], true))
		) {
			throw new RegexpException(preg_last_error_msg()
				. ' (pattern: ' . implode(' or ', (array) $args[0]) . ')', $code);
		}

		return $res;
	}
}