<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com> and Trevor Rowbotham <trevor.rowbotham@pm.me>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Polyfill\Intl\Idn;

use Exception;
use Normalizer;
use Symfony\Polyfill\Intl\Idn\Resources\unidata\DisallowedRanges;
use Symfony\Polyfill\Intl\Idn\Resources\unidata\Regex;

/**
 * Implements the UTS #46 ToASCII and ToUnicode operations behind idn_to_ascii() and
 * idn_to_utf8(), including the Punycode (RFC 3492) encoder and decoder they rely on.
 *
 * @see https://www.unicode.org/reports/tr46/
 *
 * @internal
 */
final class Idn
{
    // Error bit flags; they are OR-ed together into Info::$errors.
    public const ERROR_EMPTY_LABEL = 1;
    public const ERROR_LABEL_TOO_LONG = 2;
    public const ERROR_DOMAIN_NAME_TOO_LONG = 4;
    public const ERROR_LEADING_HYPHEN = 8;
    public const ERROR_TRAILING_HYPHEN = 0x10;
    public const ERROR_HYPHEN_3_4 = 0x20;
    public const ERROR_LEADING_COMBINING_MARK = 0x40;
    public const ERROR_DISALLOWED = 0x80;
    public const ERROR_PUNYCODE = 0x100;
    public const ERROR_LABEL_HAS_DOT = 0x200;
    public const ERROR_INVALID_ACE_LABEL = 0x400;
    public const ERROR_BIDI = 0x800;
    public const ERROR_CONTEXTJ = 0x1000;
    public const ERROR_CONTEXTO_PUNCTUATION = 0x2000;
    public const ERROR_CONTEXTO_DIGITS = 0x4000;

    // Values accepted for the $variant argument.
    public const INTL_IDNA_VARIANT_2003 = 0;
    public const INTL_IDNA_VARIANT_UTS46 = 1;

    // Bit flags accepted for the $options argument.
    public const IDNA_DEFAULT = 0;
    public const IDNA_ALLOW_UNASSIGNED = 1;
    public const IDNA_USE_STD3_RULES = 2;
    public const IDNA_CHECK_BIDI = 4;
    public const IDNA_CHECK_CONTEXTJ = 8;
    public const IDNA_NONTRANSITIONAL_TO_ASCII = 16;
    public const IDNA_NONTRANSITIONAL_TO_UNICODE = 32;

    // Length limits (in bytes) enforced by validateDomainAndLabelLength().
    public const MAX_DOMAIN_SIZE = 253;
    public const MAX_LABEL_SIZE = 63;

    // Parameters used by the Punycode encoder/decoder and adaptBias().
    public const BASE = 36;
    public const TMIN = 1;
    public const TMAX = 26;
    public const SKEW = 38;
    public const DAMP = 700;
    public const INITIAL_BIAS = 72;
    public const INITIAL_N = 128;
    public const DELIMITER = '-';
    public const MAX_INT = 2147483647;

    /**
     * Contains the numeric value of a basic code point (for use in representing integers) in the
     * range 0 to BASE-1, or -1 if it does not represent a value. Indexed by byte value (0-255).
     *
     * @var array<int, int>
     */
    private static $basicToDigit = [
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    ];

    /**
     * Lazily loaded from Resources/unidata/virama.php; keyed by code point.
     *
     * @var array<int, int>
     */
    private static $virama;

    /**
     * Lazily loaded from Resources/unidata/mapped.php; keyed by code point.
     *
     * @var array<int, string>
     */
    private static $mapped;

    /**
     * Lazily loaded from Resources/unidata/ignored.php; keyed by code point.
     *
     * @var array<int, bool>
     */
    private static $ignored;

    /**
     * Lazily loaded from Resources/unidata/deviation.php; keyed by code point.
     *
     * @var array<int, string>
     */
    private static $deviation;

    /**
     * Lazily loaded from Resources/unidata/disallowed.php; keyed by code point.
     *
     * @var array<int, bool>
     */
    private static $disallowed;

    /**
     * Lazily loaded from Resources/unidata/disallowed_STD3_mapped.php; keyed by code point.
     *
     * @var array<int, string>
     */
    private static $disallowed_STD3_mapped;

    /**
     * Lazily loaded from Resources/unidata/disallowed_STD3_valid.php; keyed by code point.
     *
     * @var array<int, bool>
     */
    private static $disallowed_STD3_valid;

    /**
     * Whether lookupCodePointStatus() has already loaded the mapping tables above.
     *
     * @var bool
     */
    private static $mappingTableLoaded = false;

    /**
     * Converts a domain name to its ASCII form, Punycode-encoding every label that contains
     * non-ASCII code points.
     *
     * @see https://www.unicode.org/reports/tr46/#ToASCII
     *
     * @param string $domainName
     * @param int    $options    Bitmask of the IDNA_* constants
     * @param int    $variant    One of the INTL_IDNA_VARIANT_* constants
     * @param array  $idna_info  Receives the keys "result", "isTransitionalDifferent" and "errors"
     *
     * @return string|false The converted domain name, or false if any error was recorded
     */
    public static function idn_to_ascii($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = [])
    {
        if (\PHP_VERSION_ID >= 70200 && self::INTL_IDNA_VARIANT_2003 === $variant) {
            @trigger_error('idn_to_ascii(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED);
        }

        // From here on $options holds the UTS #46 processing flags instead of the IDNA_* bitmask.
        $options = [
            'CheckHyphens' => true,
            'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI),
            'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ),
            'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES),
            'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_ASCII),
            'VerifyDnsLength' => true,
        ];
        $info = new Info();
        $labels = self::process((string) $domainName, $options, $info);

        foreach ($labels as $i => $label) {
            // Only convert labels to punycode that contain non-ASCII code points
            if (1 === preg_match('/[^\x00-\x7F]/', $label)) {
                try {
                    $label = 'xn--'.self::punycodeEncode($label);
                } catch (Exception $e) {
                    $info->errors |= self::ERROR_PUNYCODE;
                }

                $labels[$i] = $label;
            }
        }

        if ($options['VerifyDnsLength']) {
            self::validateDomainAndLabelLength($labels, $info);
        }

        $idna_info = [
            'result' => implode('.', $labels),
            'isTransitionalDifferent' => $info->transitionalDifferent,
            'errors' => $info->errors,
        ];

        return 0 === $info->errors ? $idna_info['result'] : false;
    }

    /**
     * Converts a domain name to its Unicode (UTF-8) form, decoding "xn--" labels.
     *
     * @see https://www.unicode.org/reports/tr46/#ToUnicode
     *
     * @param string $domainName
     * @param int    $options    Bitmask of the IDNA_* constants
     * @param int    $variant    One of the INTL_IDNA_VARIANT_* constants
     * @param array  $idna_info  Receives the keys "result", "isTransitionalDifferent" and "errors"
     *
     * @return string|false The converted domain name, or false if any error was recorded
     */
    public static function idn_to_utf8($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = [])
    {
        if (\PHP_VERSION_ID >= 70200 && self::INTL_IDNA_VARIANT_2003 === $variant) {
            @trigger_error('idn_to_utf8(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED);
        }

        $info = new Info();
        // "VerifyDnsLength" is deliberately absent: process() treats that as ToUnicode.
        $labels = self::process((string) $domainName, [
            'CheckHyphens' => true,
            'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI),
            'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ),
            'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES),
            'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_UNICODE),
        ], $info);
        $idna_info = [
            'result' => implode('.', $labels),
            'isTransitionalDifferent' => $info->transitionalDifferent,
            'errors' => $info->errors,
        ];

        return 0 === $info->errors ? $idna_info['result'] : false;
    }

    /**
     * Checks every U+200C (ZERO WIDTH NON-JOINER) and U+200D (ZERO WIDTH JOINER) in a label
     * against the ContextJ rules.
     *
     * @param array<int, int> $codePoints Code points of $label, as produced by utf8Decode()
     * @param string          $label
     *
     * @return bool False as soon as one joiner is found in an invalid context, true otherwise
     */
    private static function isValidContextJ(array $codePoints, $label)
    {
        if (!isset(self::$virama)) {
            self::$virama = require __DIR__.\DIRECTORY_SEPARATOR.'Resources'.\DIRECTORY_SEPARATOR.'unidata'.\DIRECTORY_SEPARATOR.'virama.php';
        }

        // Byte offset into $label from which the next ZWNJ regular expression search starts.
        $offset = 0;

        foreach ($codePoints as $i => $codePoint) {
            if (0x200C !== $codePoint && 0x200D !== $codePoint) {
                continue;
            }

            // A joiner at the very start of the label has no preceding code point to join with.
            if (!isset($codePoints[$i - 1])) {
                return false;
            }

            // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
            if (isset(self::$virama[$codePoints[$i - 1]])) {
                continue;
            }

            // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C(Joining_Type:T)*(Joining_Type:{R,D})) Then
            // True;
            // Generated RegExp = ([Joining_Type:{L,D}][Joining_Type:T]*\u200C[Joining_Type:T]*)[Joining_Type:{R,D}]
            if (0x200C === $codePoint && 1 === preg_match(Regex::ZWNJ, $label, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
                // NOTE(review): $offset advances by the length of the captured group only, not by
                // its position in $label ($matches[1][1]); confirm this is intended when the
                // match does not begin exactly at $offset.
                $offset += \strlen($matches[1][0]);

                continue;
            }

            return false;
        }

        return true;
    }

    /**
     * Applies the IDNA mapping table to every code point of the input string.
     *
     * @see https://www.unicode.org/reports/tr46/#ProcessingStepMap
     *
     * @param string              $input
     * @param array<string, bool> $options Reads "UseSTD3ASCIIRules" and "Transitional_Processing"
     * @param Info                $info    Receives ERROR_DISALLOWED and the transitionalDifferent flag
     *
     * @return string The mapped string
     */
    private static function mapCodePoints($input, array $options, Info $info)
    {
        $str = '';
        $useSTD3ASCIIRules = $options['UseSTD3ASCIIRules'];
        $transitional = $options['Transitional_Processing'];

        foreach (self::utf8Decode($input) as $codePoint) {
            $data = self::lookupCodePointStatus($codePoint, $useSTD3ASCIIRules);

            switch ($data['status']) {
                case 'disallowed':
                    $info->errors |= self::ERROR_DISALLOWED;

                    // no break.

                case 'valid':
                    $str .= mb_chr($codePoint, 'utf-8');

                    break;

                case 'ignored':
                    // Do nothing.
                    break;

                case 'mapped':
                    $str .= $data['mapping'];

                    break;

                case 'deviation':
                    $info->transitionalDifferent = true;
                    $str .= ($transitional ? $data['mapping'] : mb_chr($codePoint, 'utf-8'));

                    break;
            }
        }

        return $str;
    }

    /**
     * Runs the UTS #46 processing steps (map, normalize, break into labels, convert/validate)
     * on a domain name.
     *
     * @see https://www.unicode.org/reports/tr46/#Processing
     *
     * @param string              $domain
     * @param array<string, bool> $options
     * @param Info                $info    Receives any error flags raised while processing
     *
     * @return array<int, string> The processed labels, with "xn--" labels Punycode-decoded
     */
    private static function process($domain, array $options, Info $info)
    {
        // If VerifyDnsLength is not set, we are doing ToUnicode otherwise we are doing ToASCII and
        // we need to respect the VerifyDnsLength option.
        $checkForEmptyLabels = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'];

        if ($checkForEmptyLabels && '' === $domain) {
            $info->errors |= self::ERROR_EMPTY_LABEL;

            return [$domain];
        }

        // Step 1. Map each code point in the domain name string
        $domain = self::mapCodePoints($domain, $options, $info);

        // Step 2. Normalize the domain name string to Unicode Normalization Form C.
        if (!Normalizer::isNormalized($domain, Normalizer::FORM_C)) {
            $domain = Normalizer::normalize($domain, Normalizer::FORM_C);
        }

        // Step 3. Break the string into labels at U+002E (.) FULL STOP.
        $labels = explode('.', $domain);
        $lastLabelIndex = \count($labels) - 1;

        // Step 4. Convert and validate each label in the domain name string.
        foreach ($labels as $i => $label) {
            $validationOptions = $options;

            if ('xn--' === substr($label, 0, 4)) {
                try {
                    $label = self::punycodeDecode(substr($label, 4));
                } catch (Exception $e) {
                    $info->errors |= self::ERROR_PUNYCODE;

                    // The label is left untouched in $labels and is not validated any further.
                    continue;
                }

                $validationOptions['Transitional_Processing'] = false;
                $labels[$i] = $label;
            }

            // Only the last label of a multi-label domain may be empty (the root label).
            self::validateLabel($label, $info, $validationOptions, $i > 0 && $i === $lastLabelIndex);
        }

        if ($info->bidiDomain && !$info->validBidiDomain) {
            $info->errors |= self::ERROR_BIDI;
        }

        // Any input domain name string that does not record an error has been successfully
        // processed according to this specification. Conversely, if an input domain_name string
        // causes an error, then the processing of the input domain_name string fails. Determining
        // what to do with error input is up to the caller, and not in the scope of this document.
        return $labels;
    }

    /**
     * Checks a label against the Bidi rule and records the outcome on $info: bidiDomain is set
     * when the label matches Regex::RTL_LABEL, validBidiDomain is cleared when a step fails.
     *
     * @see https://tools.ietf.org/html/rfc5893#section-2
     *
     * @param string $label
     */
    private static function validateBidiLabel($label, Info $info)
    {
        if (1 === preg_match(Regex::RTL_LABEL, $label)) {
            $info->bidiDomain = true;

            // Step 1. The first character must be a character with Bidi property L, R, or AL.
            // If it has the R or AL property, it is an RTL label
            if (1 !== preg_match(Regex::BIDI_STEP_1_RTL, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            // Step 2. In an RTL label, only characters with the Bidi properties R, AL, AN, EN, ES,
            // CS, ET, ON, BN, or NSM are allowed.
            if (1 === preg_match(Regex::BIDI_STEP_2, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            // Step 3. In an RTL label, the end of the label must be a character with Bidi property
            // R, AL, EN, or AN, followed by zero or more characters with Bidi property NSM.
            if (1 !== preg_match(Regex::BIDI_STEP_3, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            // Step 4. In an RTL label, if an EN is present, no AN may be present, and vice versa.
            if (1 === preg_match(Regex::BIDI_STEP_4_AN, $label) && 1 === preg_match(Regex::BIDI_STEP_4_EN, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            return;
        }

        // We are a LTR label
        // Step 1. The first character must be a character with Bidi property L, R, or AL.
        // If it has the L property, it is an LTR label.
        if (1 !== preg_match(Regex::BIDI_STEP_1_LTR, $label)) {
            $info->validBidiDomain = false;

            return;
        }

        // Step 5. In an LTR label, only characters with the Bidi properties L, EN,
        // ES, CS, ET, ON, BN, or NSM are allowed.
        if (1 === preg_match(Regex::BIDI_STEP_5, $label)) {
            $info->validBidiDomain = false;

            return;
        }

        // Step 6. In an LTR label, the end of the label must be a character with Bidi property L or
        // EN, followed by zero or more characters with Bidi property NSM.
        if (1 !== preg_match(Regex::BIDI_STEP_6, $label)) {
            $info->validBidiDomain = false;

            return;
        }
    }

    /**
     * Flags labels longer than MAX_LABEL_SIZE bytes and domain names longer than
     * MAX_DOMAIN_SIZE bytes (one more when the name ends with the empty root label).
     *
     * @param array<int, string> $labels
     */
    private static function validateDomainAndLabelLength(array $labels, Info $info)
    {
        $maxDomainSize = self::MAX_DOMAIN_SIZE;
        $length = \count($labels);

        // Number of "." delimiters.
        $domainLength = $length - 1;

        // If the last label is empty and it is not the first label, then it is the root label.
        // Increase the max size by 1, making it 254, to account for the root label's "."
        // delimiter. This also means we don't need to check the last label's length for being too
        // long.
        if ($length > 1 && '' === $labels[$length - 1]) {
            ++$maxDomainSize;
            --$length;
        }

        for ($i = 0; $i < $length; ++$i) {
            $bytes = \strlen($labels[$i]);
            $domainLength += $bytes;

            if ($bytes > self::MAX_LABEL_SIZE) {
                $info->errors |= self::ERROR_LABEL_TOO_LONG;
            }
        }

        if ($domainLength > $maxDomainSize) {
            $info->errors |= self::ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }

    /**
     * Validates a single label and records every failed criterion as an error flag on $info.
     *
     * @see https://www.unicode.org/reports/tr46/#Validity_Criteria
     *
     * @param string              $label
     * @param array<string, bool> $options
     * @param bool                $canBeEmpty Whether an empty label is acceptable at this position
     */
    private static function validateLabel($label, Info $info, array $options, $canBeEmpty)
    {
        if ('' === $label) {
            if (!$canBeEmpty && (!isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'])) {
                $info->errors |= self::ERROR_EMPTY_LABEL;
            }

            return;
        }

        // Step 1. The label must be in Unicode Normalization Form C.
        if (!Normalizer::isNormalized($label, Normalizer::FORM_C)) {
            $info->errors |= self::ERROR_INVALID_ACE_LABEL;
        }

        $codePoints = self::utf8Decode($label);

        if ($options['CheckHyphens']) {
            // Step 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
            // in both the third and fourth positions.
            if (isset($codePoints[2], $codePoints[3]) && 0x002D === $codePoints[2] && 0x002D === $codePoints[3]) {
                $info->errors |= self::ERROR_HYPHEN_3_4;
            }

            // Step 3. If CheckHyphens, the label must neither begin nor end with a U+002D
            // HYPHEN-MINUS character.
            if ('-' === substr($label, 0, 1)) {
                $info->errors |= self::ERROR_LEADING_HYPHEN;
            }

            if ('-' === substr($label, -1, 1)) {
                $info->errors |= self::ERROR_TRAILING_HYPHEN;
            }
        }

        // Step 4. The label must not contain a U+002E (.) FULL STOP.
        if (false !== strpos($label, '.')) {
            $info->errors |= self::ERROR_LABEL_HAS_DOT;
        }

        // Step 5. The label must not begin with a combining mark, that is: General_Category=Mark.
        if (1 === preg_match(Regex::COMBINING_MARK, $label)) {
            $info->errors |= self::ERROR_LEADING_COMBINING_MARK;
        }

        // Step 6. Each code point in the label must only have certain status values according to
        // Section 5, IDNA Mapping Table:
        $transitional = $options['Transitional_Processing'];
        $useSTD3ASCIIRules = $options['UseSTD3ASCIIRules'];

        foreach ($codePoints as $codePoint) {
            $data = self::lookupCodePointStatus($codePoint, $useSTD3ASCIIRules);
            $status = $data['status'];

            if ('valid' === $status || (!$transitional && 'deviation' === $status)) {
                continue;
            }

            $info->errors |= self::ERROR_DISALLOWED;

            // One offending code point is enough to flag the label.
            break;
        }

        // Step 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in
        // The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)
        // [IDNA2008].
        if ($options['CheckJoiners'] && !self::isValidContextJ($codePoints, $label)) {
            $info->errors |= self::ERROR_CONTEXTJ;
        }

        // Step 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must
        // satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2.
        if ($options['CheckBidi'] && (!$info->bidiDomain || $info->validBidiDomain)) {
            self::validateBidiLabel($label, $info);
        }
    }

    /**
     * Decodes a Punycode string (without the "xn--" prefix) into a UTF-8 string.
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.2
     *
     * @param string $input
     *
     * @return string
     *
     * @throws Exception If the input is malformed, an intermediate value exceeds MAX_INT, or a
     *                   decoded value is not a Unicode scalar value
     */
    private static function punycodeDecode($input)
    {
        $n = self::INITIAL_N;
        $out = 0;
        $i = 0;
        $bias = self::INITIAL_BIAS;
        $lastDelimIndex = strrpos($input, self::DELIMITER);
        // Number of basic code points that precede the last delimiter.
        $b = false === $lastDelimIndex ? 0 : $lastDelimIndex;
        $inputLength = \strlen($input);
        $output = [];
        $bytes = array_map('ord', str_split($input));

        // Copy the basic code points verbatim; they must all be ASCII.
        for ($j = 0; $j < $b; ++$j) {
            if ($bytes[$j] > 0x7F) {
                throw new Exception('Invalid input');
            }

            $output[$out++] = $input[$j];
        }

        // Skip the delimiter itself.
        if ($b > 0) {
            ++$b;
        }

        for ($in = $b; $in < $inputLength; ++$out) {
            $oldi = $i;
            $w = 1;

            // Decode one generalized variable-length integer into $i.
            for ($k = self::BASE; /* no condition */; $k += self::BASE) {
                if ($in >= $inputLength) {
                    throw new Exception('Invalid input');
                }

                $digit = self::$basicToDigit[$bytes[$in++] & 0xFF];

                if ($digit < 0) {
                    throw new Exception('Invalid input');
                }

                if ($digit > intdiv(self::MAX_INT - $i, $w)) {
                    throw new Exception('Integer overflow');
                }

                $i += $digit * $w;

                if ($k <= $bias) {
                    $t = self::TMIN;
                } elseif ($k >= $bias + self::TMAX) {
                    $t = self::TMAX;
                } else {
                    $t = $k - $bias;
                }

                if ($digit < $t) {
                    break;
                }

                $baseMinusT = self::BASE - $t;

                if ($w > intdiv(self::MAX_INT, $baseMinusT)) {
                    throw new Exception('Integer overflow');
                }

                $w *= $baseMinusT;
            }

            $outPlusOne = $out + 1;
            $bias = self::adaptBias($i - $oldi, $outPlusOne, 0 === $oldi);

            if (intdiv($i, $outPlusOne) > self::MAX_INT - $n) {
                throw new Exception('Integer overflow');
            }

            $n += intdiv($i, $outPlusOne);

            // Only Unicode scalar values can be emitted. Without this guard mb_chr() returns
            // false for surrogates and values above U+10FFFF, and the code point would silently
            // disappear from the decoded label instead of being reported as an error.
            if ($n > 0x10FFFF || ($n >= 0xD800 && $n <= 0xDFFF)) {
                throw new Exception('Invalid input');
            }

            $i %= $outPlusOne;
            // Insert the decoded code point at position $i of the output.
            array_splice($output, $i++, 0, [mb_chr($n, 'utf-8')]);
        }

        return implode('', $output);
    }

    /**
     * Encodes a UTF-8 string as Punycode (without the "xn--" prefix).
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.3
     *
     * @param string $input
     *
     * @return string
     *
     * @throws Exception If the delta exceeds MAX_INT
     */
    private static function punycodeEncode($input)
    {
        $n = self::INITIAL_N;
        $delta = 0;
        $out = 0;
        $bias = self::INITIAL_BIAS;
        $inputLength = 0;
        $output = '';
        $iter = self::utf8Decode($input);

        // Copy the basic (ASCII) code points verbatim and count all code points.
        foreach ($iter as $codePoint) {
            ++$inputLength;

            if ($codePoint < 0x80) {
                $output .= \chr($codePoint);
                ++$out;
            }
        }

        // $h counts the code points handled so far, $b the basic code points.
        $h = $out;
        $b = $out;

        if ($b > 0) {
            $output .= self::DELIMITER;
            ++$out;
        }

        while ($h < $inputLength) {
            // Find the smallest code point that is >= $n.
            $m = self::MAX_INT;

            foreach ($iter as $codePoint) {
                if ($codePoint >= $n && $codePoint < $m) {
                    $m = $codePoint;
                }
            }

            if ($m - $n > intdiv(self::MAX_INT - $delta, $h + 1)) {
                throw new Exception('Integer overflow');
            }

            $delta += ($m - $n) * ($h + 1);
            $n = $m;

            foreach ($iter as $codePoint) {
                // PHP integers never wrap around to 0, so the overflow has to be detected by
                // comparing against MAX_INT.
                if ($codePoint < $n && ++$delta > self::MAX_INT) {
                    throw new Exception('Integer overflow');
                }

                if ($codePoint === $n) {
                    // Emit $delta as a generalized variable-length integer.
                    $q = $delta;

                    for ($k = self::BASE; /* no condition */; $k += self::BASE) {
                        if ($k <= $bias) {
                            $t = self::TMIN;
                        } elseif ($k >= $bias + self::TMAX) {
                            $t = self::TMAX;
                        } else {
                            $t = $k - $bias;
                        }

                        if ($q < $t) {
                            break;
                        }

                        $qMinusT = $q - $t;
                        $baseMinusT = self::BASE - $t;
                        $output .= self::encodeDigit($t + ($qMinusT) % ($baseMinusT), false);
                        ++$out;
                        $q = intdiv($qMinusT, $baseMinusT);
                    }

                    $output .= self::encodeDigit($q, false);
                    ++$out;
                    $bias = self::adaptBias($delta, $h + 1, $h === $b);
                    $delta = 0;
                    ++$h;
                }
            }

            ++$delta;
            ++$n;
        }

        return $output;
    }

    /**
     * Computes the next bias value for the Punycode encoder/decoder.
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.1
     *
     * @param int  $delta
     * @param int  $numPoints
     * @param bool $firstTime Whether this is the first adaptation (delta is then divided by DAMP)
     *
     * @return int
     */
    private static function adaptBias($delta, $numPoints, $firstTime)
    {
        // xxx >> 1 is a faster way of doing intdiv(xxx, 2)
        $delta = $firstTime ? intdiv($delta, self::DAMP) : $delta >> 1;
        $delta += intdiv($delta, $numPoints);
        $k = 0;

        while ($delta > ((self::BASE - self::TMIN) * self::TMAX) >> 1) {
            $delta = intdiv($delta, self::BASE - self::TMIN);
            $k += self::BASE;
        }

        return $k + intdiv((self::BASE - self::TMIN + 1) * $delta, $delta + self::SKEW);
    }

    /**
     * Returns the basic code point for a digit value: 0-25 map to "a"-"z" and 26-35 map to
     * "0"-"9". A true $flag subtracts 32 from the byte value (upper-case for letters).
     *
     * @param int  $d
     * @param bool $flag
     *
     * @return string
     */
    private static function encodeDigit($d, $flag)
    {
        return \chr($d + 22 + 75 * ($d < 26 ? 1 : 0) - (($flag ? 1 : 0) << 5));
    }

    /**
     * Takes a UTF-8 encoded string and converts it into a series of integer code points. Any
     * invalid byte sequences will be replaced by a U+FFFD replacement code point.
     *
     * @see https://encoding.spec.whatwg.org/#utf-8-decoder
     *
     * @param string $input
     *
     * @return array<int, int>
     */
    private static function utf8Decode($input)
    {
        $bytesSeen = 0;
        $bytesNeeded = 0;
        $lowerBoundary = 0x80;
        $upperBoundary = 0xBF;
        $codePoint = 0;
        $codePoints = [];
        $length = \strlen($input);

        for ($i = 0; $i < $length; ++$i) {
            $byte = \ord($input[$i]);

            // Not inside a multi-byte sequence: this byte is either ASCII or a lead byte.
            if (0 === $bytesNeeded) {
                if ($byte >= 0x00 && $byte <= 0x7F) {
                    $codePoints[] = $byte;

                    continue;
                }

                if ($byte >= 0xC2 && $byte <= 0xDF) {
                    $bytesNeeded = 1;
                    $codePoint = $byte & 0x1F;
                } elseif ($byte >= 0xE0 && $byte <= 0xEF) {
                    // Narrow the range allowed for the next byte after these lead bytes.
                    if (0xE0 === $byte) {
                        $lowerBoundary = 0xA0;
                    } elseif (0xED === $byte) {
                        $upperBoundary = 0x9F;
                    }

                    $bytesNeeded = 2;
                    $codePoint = $byte & 0xF;
                } elseif ($byte >= 0xF0 && $byte <= 0xF4) {
                    // Narrow the range allowed for the next byte after these lead bytes.
                    if (0xF0 === $byte) {
                        $lowerBoundary = 0x90;
                    } elseif (0xF4 === $byte) {
                        $upperBoundary = 0x8F;
                    }

                    $bytesNeeded = 3;
                    $codePoint = $byte & 0x7;
                } else {
                    $codePoints[] = 0xFFFD;
                }

                continue;
            }

            // Invalid continuation byte: emit U+FFFD, reset the state and reprocess this byte
            // as the start of a new sequence.
            if ($byte < $lowerBoundary || $byte > $upperBoundary) {
                $codePoint = 0;
                $bytesNeeded = 0;
                $bytesSeen = 0;
                $lowerBoundary = 0x80;
                $upperBoundary = 0xBF;
                --$i;
                $codePoints[] = 0xFFFD;

                continue;
            }

            $lowerBoundary = 0x80;
            $upperBoundary = 0xBF;
            $codePoint = ($codePoint << 6) | ($byte & 0x3F);

            if (++$bytesSeen !== $bytesNeeded) {
                continue;
            }

            $codePoints[] = $codePoint;
            $codePoint = 0;
            $bytesNeeded = 0;
            $bytesSeen = 0;
        }

        // String unexpectedly ended, so append a U+FFFD code point.
        if (0 !== $bytesNeeded) {
            $codePoints[] = 0xFFFD;
        }

        return $codePoints;
    }

    /**
     * Looks up the IDNA mapping table status of a code point, loading the tables from
     * Resources/unidata on first use.
     *
     * @param int  $codePoint
     * @param bool $useSTD3ASCIIRules When true, code points from the disallowed_STD3_* tables are
     *                                reported as "disallowed" instead of "mapped" / "valid"
     *
     * @return array{status: string, mapping?: string}
     */
    private static function lookupCodePointStatus($codePoint, $useSTD3ASCIIRules)
    {
        if (!self::$mappingTableLoaded) {
            self::$mappingTableLoaded = true;
            self::$mapped = require __DIR__.'/Resources/unidata/mapped.php';
            self::$ignored = require __DIR__.'/Resources/unidata/ignored.php';
            self::$deviation = require __DIR__.'/Resources/unidata/deviation.php';
            self::$disallowed = require __DIR__.'/Resources/unidata/disallowed.php';
            self::$disallowed_STD3_mapped = require __DIR__.'/Resources/unidata/disallowed_STD3_mapped.php';
            self::$disallowed_STD3_valid = require __DIR__.'/Resources/unidata/disallowed_STD3_valid.php';
        }

        if (isset(self::$mapped[$codePoint])) {
            return ['status' => 'mapped', 'mapping' => self::$mapped[$codePoint]];
        }

        if (isset(self::$ignored[$codePoint])) {
            return ['status' => 'ignored'];
        }

        if (isset(self::$deviation[$codePoint])) {
            return ['status' => 'deviation', 'mapping' => self::$deviation[$codePoint]];
        }

        if (isset(self::$disallowed[$codePoint]) || DisallowedRanges::inRange($codePoint)) {
            return ['status' => 'disallowed'];
        }

        $isDisallowedMapped = isset(self::$disallowed_STD3_mapped[$codePoint]);

        if ($isDisallowedMapped || isset(self::$disallowed_STD3_valid[$codePoint])) {
            $status = 'disallowed';

            if (!$useSTD3ASCIIRules) {
                $status = $isDisallowedMapped ? 'mapped' : 'valid';
            }

            // The mapping is returned even when the status is "disallowed".
            if ($isDisallowedMapped) {
                return ['status' => $status, 'mapping' => self::$disallowed_STD3_mapped[$codePoint]];
            }

            return ['status' => $status];
        }

        // Anything not listed in one of the tables is valid.
        return ['status' => 'valid'];
    }
}