self::decodeOne($m[0]), $text ); }

    /**
     * Decode exactly one entity reference.
     *
     * Precondition: the caller has already checked that $match conforms to
     * self::PATTERN. This is the inexpensive entry point for callers that
     * already hold a single match (e.g. the lexer mode), so they can skip
     * the preg_replace_callback scan performed by decode().
     *
     * @param string $match A single entity reference, e.g. &#35; or &copy;
     * @return string The decoded codepoint(s); a named entity that is not
     *                recognised is returned as its original literal bytes
     */
    public static function decodeOne(string $match): string
    {
        $isNumericReference = $match[1] === '#';

        if (!$isNumericReference) {
            // Named reference. An unknown name comes back untouched from
            // html_entity_decode; the renderer's &-escape then re-encodes
            // the ampersand on output.
            return html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');
        }

        // Numeric references are handled by hand instead of going through
        // html_entity_decode: PHP hands the input back unchanged for U+0000,
        // surrogates, and codepoints it deems unsafe (U+10FFFF and the BMP
        // noncharacters among them), whereas CommonMark wants either U+FFFD
        // or the literal codepoint.
        // Strip the leading "&#" and the trailing ";".
        $numericBody = substr($match, 2, -1);

        return self::decodeNumeric($numericBody);
    }

    /**
     * Turn the numeric part of a numeric character reference into UTF-8,
     * substituting U+FFFD for the codepoints CommonMark treats as invalid
     * (zero, the surrogate range, anything above U+10FFFF).
     *
     * @param string $body The text between &# and ; — either decimal digits,
     *                     or x/X followed by hexadecimal digits
     * @return string The UTF-8 encoding of the codepoint, or self::REPLACEMENT
     *                when the codepoint is invalid or cannot be encoded
     */
    protected static function decodeNumeric(string $body): string
    {
        // A leading x/X selects hexadecimal; everything else is decimal.
        $cp = ($body[0] === 'x' || $body[0] === 'X')
            ? hexdec(substr($body, 1))
            : (int) $body;

        $isSurrogate = $cp >= 0xD800 && $cp <= 0xDFFF;

        if ($cp === 0 || $isSurrogate || $cp > 0x10FFFF) {
            return self::REPLACEMENT;
        }

        $encoded = Unicode::toUtf8([$cp]);

        // An encoder failure (false or empty string) also falls back to the
        // replacement character.
        return ($encoded === false || $encoded === '')
            ? self::REPLACEMENT
            : $encoded;
    }
}