1<?php
2
3/**
4 * This file is part of the Nette Framework (https://nette.org)
5 * Copyright (c) 2004 David Grudl (https://davidgrudl.com)
6 */
7
8declare(strict_types=1);
9
10namespace Nette\Utils;
11
12use JetBrains\PhpStorm\Language;
13use Nette;
14use function is_array, is_object, strlen;
15
16
17/**
18 * String tools library.
19 */
20class Strings
21{
22	use Nette\StaticClass;
23
	/**
	 * Default set of characters stripped by Strings::trim(): space, tab, line feed, carriage return,
	 * NUL byte, vertical tab and U+00A0 (no-break space).
	 */
	public const TrimCharacters = " \t\n\r\0\x0B\u{A0}";

	/** @deprecated use Strings::TrimCharacters */
	public const TRIM_CHARACTERS = self::TrimCharacters;
28
29
30	/**
31	 * @deprecated use Nette\Utils\Validator::isUnicode()
32	 */
33	public static function checkEncoding(string $s): bool
34	{
35		return $s === self::fixEncoding($s);
36	}
37
38
39	/**
40	 * Removes all invalid UTF-8 characters from a string.
41	 */
42	public static function fixEncoding(string $s): string
43	{
44		// removes xD800-xDFFF, x110000 and higher
45		return htmlspecialchars_decode(htmlspecialchars($s, ENT_NOQUOTES | ENT_IGNORE, 'UTF-8'), ENT_NOQUOTES);
46	}
47
48
49	/**
50	 * Returns a specific character in UTF-8 from code point (number in range 0x0000..D7FF or 0xE000..10FFFF).
51	 * @throws Nette\InvalidArgumentException if code point is not in valid range
52	 */
53	public static function chr(int $code): string
54	{
55		if ($code < 0 || ($code >= 0xD800 && $code <= 0xDFFF) || $code > 0x10FFFF) {
56			throw new Nette\InvalidArgumentException('Code point must be in range 0x0 to 0xD7FF or 0xE000 to 0x10FFFF.');
57		} elseif (!extension_loaded('iconv')) {
58			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
59		}
60
61		return iconv('UTF-32BE', 'UTF-8//IGNORE', pack('N', $code));
62	}
63
64
65	/**
66	 * Returns a code point of specific character in UTF-8 (number in range 0x0000..D7FF or 0xE000..10FFFF).
67	 */
68	public static function ord(string $c): int
69	{
70		if (!extension_loaded('iconv')) {
71			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
72		}
73
74		$tmp = iconv('UTF-8', 'UTF-32BE//IGNORE', $c);
75		if (!$tmp) {
76			throw new Nette\InvalidArgumentException('Invalid UTF-8 character "' . ($c === '' ? '' : '\x' . strtoupper(bin2hex($c))) . '".');
77		}
78
79		return unpack('N', $tmp)[1];
80	}
81
82
83	/**
84	 * @deprecated use str_starts_with()
85	 */
86	public static function startsWith(string $haystack, string $needle): bool
87	{
88		return str_starts_with($haystack, $needle);
89	}
90
91
92	/**
93	 * @deprecated use str_ends_with()
94	 */
95	public static function endsWith(string $haystack, string $needle): bool
96	{
97		return str_ends_with($haystack, $needle);
98	}
99
100
101	/**
102	 * @deprecated use str_contains()
103	 */
104	public static function contains(string $haystack, string $needle): bool
105	{
106		return str_contains($haystack, $needle);
107	}
108
109
110	/**
111	 * Returns a part of UTF-8 string specified by starting position and length. If start is negative,
112	 * the returned string will start at the start'th character from the end of string.
113	 */
114	public static function substring(string $s, int $start, ?int $length = null): string
115	{
116		if (function_exists('mb_substr')) {
117			return mb_substr($s, $start, $length, 'UTF-8'); // MB is much faster
118		} elseif (!extension_loaded('iconv')) {
119			throw new Nette\NotSupportedException(__METHOD__ . '() requires extension ICONV or MBSTRING, neither is loaded.');
120		} elseif ($length === null) {
121			$length = self::length($s);
122		} elseif ($start < 0 && $length < 0) {
123			$start += self::length($s); // unifies iconv_substr behavior with mb_substr
124		}
125
126		return iconv_substr($s, $start, $length, 'UTF-8');
127	}
128
129
130	/**
131	 * Removes control characters, normalizes line breaks to `\n`, removes leading and trailing blank lines,
132	 * trims end spaces on lines, normalizes UTF-8 to the normal form of NFC.
133	 */
134	public static function normalize(string $s): string
135	{
136		// convert to compressed normal form (NFC)
137		if (class_exists('Normalizer', false) && ($n = \Normalizer::normalize($s, \Normalizer::FORM_C)) !== false) {
138			$s = $n;
139		}
140
141		$s = self::unixNewLines($s);
142
143		// remove control characters; leave \t + \n
144		$s = self::pcre('preg_replace', ['#[\x00-\x08\x0B-\x1F\x7F-\x9F]+#u', '', $s]);
145
146		// right trim
147		$s = self::pcre('preg_replace', ['#[\t ]+$#m', '', $s]);
148
149		// leading and trailing blank lines
150		$s = trim($s, "\n");
151
152		return $s;
153	}
154
155
156	/** @deprecated use Strings::unixNewLines() */
157	public static function normalizeNewLines(string $s): string
158	{
159		return self::unixNewLines($s);
160	}
161
162
163	/**
164	 * Converts line endings to \n used on Unix-like systems.
165	 * Line endings are: \n, \r, \r\n, U+2028 line separator, U+2029 paragraph separator.
166	 */
167	public static function unixNewLines(string $s): string
168	{
169		return preg_replace("~\r\n?|\u{2028}|\u{2029}~", "\n", $s);
170	}
171
172
173	/**
174	 * Converts line endings to platform-specific, i.e. \r\n on Windows and \n elsewhere.
175	 * Line endings are: \n, \r, \r\n, U+2028 line separator, U+2029 paragraph separator.
176	 */
177	public static function platformNewLines(string $s): string
178	{
179		return preg_replace("~\r\n?|\n|\u{2028}|\u{2029}~", PHP_EOL, $s);
180	}
181
182
183	/**
184	 * Converts UTF-8 string to ASCII, ie removes diacritics etc.
185	 */
186	public static function toAscii(string $s): string
187	{
188		$iconv = defined('ICONV_IMPL') ? trim(ICONV_IMPL, '"\'') : null;
189		static $transliterator = null;
190		if ($transliterator === null) {
191			if (class_exists('Transliterator', false)) {
192				$transliterator = \Transliterator::create('Any-Latin; Latin-ASCII');
193			} else {
194				trigger_error(__METHOD__ . "(): it is recommended to enable PHP extensions 'intl'.", E_USER_NOTICE);
195				$transliterator = false;
196			}
197		}
198
199		// remove control characters and check UTF-8 validity
200		$s = self::pcre('preg_replace', ['#[^\x09\x0A\x0D\x20-\x7E\xA0-\x{2FF}\x{370}-\x{10FFFF}]#u', '', $s]);
201
202		// transliteration (by Transliterator and iconv) is not optimal, replace some characters directly
203		$s = strtr($s, ["\u{201E}" => '"', "\u{201C}" => '"', "\u{201D}" => '"', "\u{201A}" => "'", "\u{2018}" => "'", "\u{2019}" => "'", "\u{B0}" => '^', "\u{42F}" => 'Ya', "\u{44F}" => 'ya', "\u{42E}" => 'Yu', "\u{44E}" => 'yu', "\u{c4}" => 'Ae', "\u{d6}" => 'Oe', "\u{dc}" => 'Ue', "\u{1e9e}" => 'Ss', "\u{e4}" => 'ae', "\u{f6}" => 'oe', "\u{fc}" => 'ue', "\u{df}" => 'ss']); // „ “ ” ‚ ‘ ’ ° Я я Ю ю Ä Ö Ü ẞ ä ö ü ß
204		if ($iconv !== 'libiconv') {
205			$s = strtr($s, ["\u{AE}" => '(R)', "\u{A9}" => '(c)', "\u{2026}" => '...', "\u{AB}" => '<<', "\u{BB}" => '>>', "\u{A3}" => 'lb', "\u{A5}" => 'yen', "\u{B2}" => '^2', "\u{B3}" => '^3', "\u{B5}" => 'u', "\u{B9}" => '^1', "\u{BA}" => 'o', "\u{BF}" => '?', "\u{2CA}" => "'", "\u{2CD}" => '_', "\u{2DD}" => '"', "\u{1FEF}" => '', "\u{20AC}" => 'EUR', "\u{2122}" => 'TM', "\u{212E}" => 'e', "\u{2190}" => '<-', "\u{2191}" => '^', "\u{2192}" => '->', "\u{2193}" => 'V', "\u{2194}" => '<->']); // ® © … « » £ ¥ ² ³ µ ¹ º ¿ ˊ ˍ ˝ ` € ™ ℮ ← ↑ → ↓ ↔
206		}
207
208		if ($transliterator) {
209			$s = $transliterator->transliterate($s);
210			// use iconv because The transliterator leaves some characters out of ASCII, eg → ʾ
211			if ($iconv === 'glibc') {
212				$s = strtr($s, '?', "\x01"); // temporarily hide ? to distinguish them from the garbage that iconv creates
213				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
214				$s = str_replace(['?', "\x01"], ['', '?'], $s); // remove garbage and restore ? characters
215			} elseif ($iconv === 'libiconv') {
216				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
217			} else { // null or 'unknown' (#216)
218				$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]); // remove non-ascii chars
219			}
220		} elseif ($iconv === 'glibc' || $iconv === 'libiconv') {
221			// temporarily hide these characters to distinguish them from the garbage that iconv creates
222			$s = strtr($s, '`\'"^~?', "\x01\x02\x03\x04\x05\x06");
223			if ($iconv === 'glibc') {
224				// glibc implementation is very limited. transliterate into Windows-1250 and then into ASCII, so most Eastern European characters are preserved
225				$s = iconv('UTF-8', 'WINDOWS-1250//TRANSLIT//IGNORE', $s);
226				$s = strtr(
227					$s,
228					"\xa5\xa3\xbc\x8c\xa7\x8a\xaa\x8d\x8f\x8e\xaf\xb9\xb3\xbe\x9c\x9a\xba\x9d\x9f\x9e\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\x96\xa0\x8b\x97\x9b\xa6\xad\xb7",
229					'ALLSSSSTZZZallssstzzzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTsraaaalccceeeeiiddnnooooruuuuyt- <->|-.',
230				);
231				$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]);
232			} else {
233				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
234			}
235
236			// remove garbage that iconv creates during transliteration (eg Ý -> Y')
237			$s = str_replace(['`', "'", '"', '^', '~', '?'], '', $s);
238			// restore temporarily hidden characters
239			$s = strtr($s, "\x01\x02\x03\x04\x05\x06", '`\'"^~?');
240		} else {
241			$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]); // remove non-ascii chars
242		}
243
244		return $s;
245	}
246
247
248	/**
249	 * Modifies the UTF-8 string to the form used in the URL, ie removes diacritics and replaces all characters
250	 * except letters of the English alphabet and numbers with a hyphens.
251	 */
252	public static function webalize(string $s, ?string $charlist = null, bool $lower = true): string
253	{
254		$s = self::toAscii($s);
255		if ($lower) {
256			$s = strtolower($s);
257		}
258
259		$s = self::pcre('preg_replace', ['#[^a-z0-9' . ($charlist !== null ? preg_quote($charlist, '#') : '') . ']+#i', '-', $s]);
260		$s = trim($s, '-');
261		return $s;
262	}
263
264
265	/**
266	 * Truncates a UTF-8 string to given maximal length, while trying not to split whole words. Only if the string is truncated,
267	 * an ellipsis (or something else set with third argument) is appended to the string.
268	 */
269	public static function truncate(string $s, int $maxLen, string $append = "\u{2026}"): string
270	{
271		if (self::length($s) > $maxLen) {
272			$maxLen -= self::length($append);
273			if ($maxLen < 1) {
274				return $append;
275
276			} elseif ($matches = self::match($s, '#^.{1,' . $maxLen . '}(?=[\s\x00-/:-@\[-`{-~])#us')) {
277				return $matches[0] . $append;
278
279			} else {
280				return self::substring($s, 0, $maxLen) . $append;
281			}
282		}
283
284		return $s;
285	}
286
287
288	/**
289	 * Indents a multiline text from the left. Second argument sets how many indentation chars should be used,
290	 * while the indent itself is the third argument (*tab* by default).
291	 */
292	public static function indent(string $s, int $level = 1, string $chars = "\t"): string
293	{
294		if ($level > 0) {
295			$s = self::replace($s, '#(?:^|[\r\n]+)(?=[^\r\n])#', '$0' . str_repeat($chars, $level));
296		}
297
298		return $s;
299	}
300
301
302	/**
303	 * Converts all characters of UTF-8 string to lower case.
304	 */
305	public static function lower(string $s): string
306	{
307		return mb_strtolower($s, 'UTF-8');
308	}
309
310
311	/**
312	 * Converts the first character of a UTF-8 string to lower case and leaves the other characters unchanged.
313	 */
314	public static function firstLower(string $s): string
315	{
316		return self::lower(self::substring($s, 0, 1)) . self::substring($s, 1);
317	}
318
319
320	/**
321	 * Converts all characters of a UTF-8 string to upper case.
322	 */
323	public static function upper(string $s): string
324	{
325		return mb_strtoupper($s, 'UTF-8');
326	}
327
328
329	/**
330	 * Converts the first character of a UTF-8 string to upper case and leaves the other characters unchanged.
331	 */
332	public static function firstUpper(string $s): string
333	{
334		return self::upper(self::substring($s, 0, 1)) . self::substring($s, 1);
335	}
336
337
338	/**
339	 * Converts the first character of every word of a UTF-8 string to upper case and the others to lower case.
340	 */
341	public static function capitalize(string $s): string
342	{
343		return mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
344	}
345
346
347	/**
348	 * Compares two UTF-8 strings or their parts, without taking character case into account. If length is null, whole strings are compared,
349	 * if it is negative, the corresponding number of characters from the end of the strings is compared,
350	 * otherwise the appropriate number of characters from the beginning is compared.
351	 */
352	public static function compare(string $left, string $right, ?int $length = null): bool
353	{
354		if (class_exists('Normalizer', false)) {
355			$left = \Normalizer::normalize($left, \Normalizer::FORM_D); // form NFD is faster
356			$right = \Normalizer::normalize($right, \Normalizer::FORM_D); // form NFD is faster
357		}
358
359		if ($length < 0) {
360			$left = self::substring($left, $length, -$length);
361			$right = self::substring($right, $length, -$length);
362		} elseif ($length !== null) {
363			$left = self::substring($left, 0, $length);
364			$right = self::substring($right, 0, $length);
365		}
366
367		return self::lower($left) === self::lower($right);
368	}
369
370
371	/**
372	 * Finds the common prefix of strings or returns empty string if the prefix was not found.
373	 * @param  string[]  $strings
374	 */
375	public static function findPrefix(array $strings): string
376	{
377		$first = array_shift($strings);
378		for ($i = 0; $i < strlen($first); $i++) {
379			foreach ($strings as $s) {
380				if (!isset($s[$i]) || $first[$i] !== $s[$i]) {
381					while ($i && $first[$i - 1] >= "\x80" && $first[$i] >= "\x80" && $first[$i] < "\xC0") {
382						$i--;
383					}
384
385					return substr($first, 0, $i);
386				}
387			}
388		}
389
390		return $first;
391	}
392
393
394	/**
395	 * Returns number of characters (not bytes) in UTF-8 string.
396	 * That is the number of Unicode code points which may differ from the number of graphemes.
397	 */
398	public static function length(string $s): int
399	{
400		return function_exists('mb_strlen')
401			? mb_strlen($s, 'UTF-8')
402			: strlen(utf8_decode($s));
403	}
404
405
406	/**
407	 * Removes all left and right side spaces (or the characters passed as second argument) from a UTF-8 encoded string.
408	 */
409	public static function trim(string $s, string $charlist = self::TrimCharacters): string
410	{
411		$charlist = preg_quote($charlist, '#');
412		return self::replace($s, '#^[' . $charlist . ']+|[' . $charlist . ']+$#Du', '');
413	}
414
415
416	/**
417	 * Pads a UTF-8 string to given length by prepending the $pad string to the beginning.
418	 * @param  non-empty-string  $pad
419	 */
420	public static function padLeft(string $s, int $length, string $pad = ' '): string
421	{
422		$length = max(0, $length - self::length($s));
423		$padLen = self::length($pad);
424		return str_repeat($pad, (int) ($length / $padLen)) . self::substring($pad, 0, $length % $padLen) . $s;
425	}
426
427
428	/**
429	 * Pads UTF-8 string to given length by appending the $pad string to the end.
430	 * @param  non-empty-string  $pad
431	 */
432	public static function padRight(string $s, int $length, string $pad = ' '): string
433	{
434		$length = max(0, $length - self::length($s));
435		$padLen = self::length($pad);
436		return $s . str_repeat($pad, (int) ($length / $padLen)) . self::substring($pad, 0, $length % $padLen);
437	}
438
439
440	/**
441	 * Reverses UTF-8 string.
442	 */
443	public static function reverse(string $s): string
444	{
445		if (!extension_loaded('iconv')) {
446			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
447		}
448
449		return iconv('UTF-32LE', 'UTF-8', strrev(iconv('UTF-8', 'UTF-32BE', $s)));
450	}
451
452
453	/**
454	 * Returns part of $haystack before $nth occurence of $needle or returns null if the needle was not found.
455	 * Negative value means searching from the end.
456	 */
457	public static function before(string $haystack, string $needle, int $nth = 1): ?string
458	{
459		$pos = self::pos($haystack, $needle, $nth);
460		return $pos === null
461			? null
462			: substr($haystack, 0, $pos);
463	}
464
465
466	/**
467	 * Returns part of $haystack after $nth occurence of $needle or returns null if the needle was not found.
468	 * Negative value means searching from the end.
469	 */
470	public static function after(string $haystack, string $needle, int $nth = 1): ?string
471	{
472		$pos = self::pos($haystack, $needle, $nth);
473		return $pos === null
474			? null
475			: substr($haystack, $pos + strlen($needle));
476	}
477
478
479	/**
480	 * Returns position in characters of $nth occurence of $needle in $haystack or null if the $needle was not found.
481	 * Negative value of `$nth` means searching from the end.
482	 */
483	public static function indexOf(string $haystack, string $needle, int $nth = 1): ?int
484	{
485		$pos = self::pos($haystack, $needle, $nth);
486		return $pos === null
487			? null
488			: self::length(substr($haystack, 0, $pos));
489	}
490
491
492	/**
493	 * Returns position in characters of $nth occurence of $needle in $haystack or null if the needle was not found.
494	 */
495	private static function pos(string $haystack, string $needle, int $nth = 1): ?int
496	{
497		if (!$nth) {
498			return null;
499		} elseif ($nth > 0) {
500			if ($needle === '') {
501				return 0;
502			}
503
504			$pos = 0;
505			while (($pos = strpos($haystack, $needle, $pos)) !== false && --$nth) {
506				$pos++;
507			}
508		} else {
509			$len = strlen($haystack);
510			if ($needle === '') {
511				return $len;
512			} elseif ($len === 0) {
513				return null;
514			}
515
516			$pos = $len - 1;
517			while (($pos = strrpos($haystack, $needle, $pos - $len)) !== false && ++$nth) {
518				$pos--;
519			}
520		}
521
522		return Helpers::falseToNull($pos);
523	}
524
525
526	/**
527	 * Divides the string into arrays according to the regular expression. Expressions in parentheses will be captured and returned as well.
528	 */
529	public static function split(
530		string $subject,
531		#[Language('RegExp')]
532		string $pattern,
533		bool|int $captureOffset = false,
534		bool $skipEmpty = false,
535		int $limit = -1,
536		bool $utf8 = false,
537	): array
538	{
539		$flags = is_int($captureOffset)  // back compatibility
540			? $captureOffset
541			: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
542
543		$pattern .= $utf8 ? 'u' : '';
544		$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
545		return $utf8 && $captureOffset
546			? self::bytesToChars($subject, [$m])[0]
547			: $m;
548
549	}
550
551
552	/**
553	 * Searches the string for the part matching the regular expression and returns
554	 * an array with the found expression and individual subexpressions, or `null`.
555	 */
556	public static function match(
557		string $subject,
558		#[Language('RegExp')]
559		string $pattern,
560		bool|int $captureOffset = false,
561		int $offset = 0,
562		bool $unmatchedAsNull = false,
563		bool $utf8 = false,
564	): ?array
565	{
566		$flags = is_int($captureOffset) // back compatibility
567			? $captureOffset
568			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
569
570		if ($utf8) {
571			$offset = strlen(self::substring($subject, 0, $offset));
572			$pattern .= 'u';
573		}
574
575		if ($offset > strlen($subject)) {
576			return null;
577		} elseif (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
578			return null;
579		} elseif ($utf8 && $captureOffset) {
580			return self::bytesToChars($subject, [$m])[0];
581		} else {
582			return $m;
583		}
584	}
585
586
587	/**
588	 * Searches the string for all occurrences matching the regular expression and
589	 * returns an array of arrays containing the found expression and each subexpression.
590	 */
591	public static function matchAll(
592		string $subject,
593		#[Language('RegExp')]
594		string $pattern,
595		bool|int $captureOffset = false,
596		int $offset = 0,
597		bool $unmatchedAsNull = false,
598		bool $patternOrder = false,
599		bool $utf8 = false,
600	): array
601	{
602		$flags = is_int($captureOffset) // back compatibility
603			? $captureOffset
604			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
605
606		if ($utf8) {
607			$offset = strlen(self::substring($subject, 0, $offset));
608			$pattern .= 'u';
609		}
610
611		if ($offset > strlen($subject)) {
612			return [];
613		}
614
615		self::pcre('preg_match_all', [
616			$pattern, $subject, &$m,
617			($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
618			$offset,
619		]);
620		return $utf8 && $captureOffset
621			? self::bytesToChars($subject, $m)
622			: $m;
623
624	}
625
626
627	/**
628	 * Replaces all occurrences matching regular expression $pattern which can be string or array in the form `pattern => replacement`.
629	 */
630	public static function replace(
631		string $subject,
632		#[Language('RegExp')]
633		string|array $pattern,
634		string|callable $replacement = '',
635		int $limit = -1,
636		bool $captureOffset = false,
637		bool $unmatchedAsNull = false,
638		bool $utf8 = false,
639	): string
640	{
641		if (is_object($replacement) || is_array($replacement)) {
642			if (!is_callable($replacement, false, $textual)) {
643				throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
644			}
645
646			$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
647			if ($utf8) {
648				$pattern .= 'u';
649				if ($captureOffset) {
650					$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
651				}
652			}
653
654			return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
655
656		} elseif (is_array($pattern) && is_string(key($pattern))) {
657			$replacement = array_values($pattern);
658			$pattern = array_keys($pattern);
659		}
660
661		if ($utf8) {
662			$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
663		}
664
665		return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
666	}
667
668
669	private static function bytesToChars(string $s, array $groups): array
670	{
671		$lastBytes = $lastChars = 0;
672		foreach ($groups as &$matches) {
673			foreach ($matches as &$match) {
674				if ($match[1] > $lastBytes) {
675					$lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes));
676				} elseif ($match[1] < $lastBytes) {
677					$lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1]));
678				}
679
680				$lastBytes = $match[1];
681				$match[1] = $lastChars;
682			}
683		}
684
685		return $groups;
686	}
687
688
689	/** @internal */
690	public static function pcre(string $func, array $args)
691	{
692		$res = Callback::invokeSafe($func, $args, function (string $message) use ($args): void {
693			// compile-time error, not detectable by preg_last_error
694			throw new RegexpException($message . ' in pattern: ' . implode(' or ', (array) $args[0]));
695		});
696
697		if (($code = preg_last_error()) // run-time error, but preg_last_error & return code are liars
698			&& ($res === null || !in_array($func, ['preg_filter', 'preg_replace_callback', 'preg_replace'], true))
699		) {
700			throw new RegexpException(preg_last_error_msg()
701				. ' (pattern: ' . implode(' or ', (array) $args[0]) . ')', $code);
702		}
703
704		return $res;
705	}
706}
707