allowedModes = [];
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 255;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // One special pattern covers all three reference forms: decimal
        // (&#123;), hexadecimal (&#x1F600;) and named (&amp;).
        $entityPattern = '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});';

        $this->Lexer->addSpecialPattern($entityPattern, $mode, 'gfm_html_entity');
    }

    /** @inheritdoc */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // The character right after "&" tells numeric references apart
        // from named ones.
        $isNumericReference = ($match[1] === '#');

        // Numeric references get their own decoder instead of
        // html_entity_decode: PHP hands the input back untouched for
        // U+0000, surrogates, and codepoints it deems unsafe (U+10FFFF
        // and BMP noncharacters among them), whereas CommonMark wants
        // U+FFFD or the literal codepoint.
        //
        // Named references go through html_entity_decode; an unknown name
        // comes back unchanged, and the renderer's &-escape restores it
        // to &xxx; on output.
        $decoded = $isNumericReference
            ? $this->decodeNumeric(substr($match, 2, -1))
            : html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');

        $handler->addCall('cdata', [$decoded], $pos);

        return true;
    }

    /**
     * Convert the body of a numeric character reference to UTF-8 text.
     *
     * @param string $body the reference without its leading "&#" and its
     *                     trailing ";", e.g. "65" or "x41"
     * @return string the encoded character, or self::REPLACEMENT (declared
     *                outside this excerpt) when the codepoint is zero, above
     *                0x10FFFF, a surrogate, or cannot be encoded
     */
    protected function decodeNumeric(string $body): string
    {
        $marker = $body[0];
        $isHex = ($marker === 'x' || $marker === 'X');

        // Hex bodies carry a one-character x/X prefix that is dropped
        // before conversion; anything else is read as decimal.
        $codepoint = $isHex ? hexdec(substr($body, 1)) : (int) $body;

        $isSurrogate = ($codepoint >= 0xD800 && $codepoint <= 0xDFFF);
        $isEncodable = ($codepoint !== 0 && !($codepoint > 0x10FFFF) && !$isSurrogate);
        if (!$isEncodable) {
            return self::REPLACEMENT;
        }

        $utf8 = Unicode::toUtf8([$codepoint]);

        // Treat a failed or empty conversion like an invalid codepoint.
        return ($utf8 === false || $utf8 === '') ? self::REPLACEMENT : $utf8;
    }
}