<?php

namespace dokuwiki\Parsing\Helpers;

use dokuwiki\Utf8\Unicode;

/**
 * Pure helper for decoding HTML entity references - numeric (`&#nnn;`,
 * `&#xhhh;`) and HTML5 named (`&copy;`, `&AElig;`, ...) - to their
 * Unicode codepoint(s).
 *
 * Whole-span PROTECTED modes (GfmCode, GfmLink, ...) capture their body
 * in a single regex match, so the inline GfmHtmlEntity pattern never
 * sees the bytes inside. For the slots GFM still wants entity-decoded -
 * fenced code info strings, link destinations, link titles - call
 * decode() after extracting the literal substring.
 *
 * Per CommonMark, codepoint 0, codepoints above U+10FFFF, and the
 * surrogate range U+D800..U+DFFF map to U+FFFD (REPLACEMENT CHARACTER)
 * for numeric references. Unknown named references are returned
 * unchanged - the caller emits them literally and the renderer's
 * &-escaping turns them back into `&xxx;` on output.
 */
class HtmlEntity
{
    /**
     * Regex matching one HTML entity reference (without delimiters).
     * Shared by GfmHtmlEntity (as the lexer special-pattern) and
     * decode() (as the scan pattern), so the two stay in lockstep.
     *
     * Accepts `&#` + 1-7 decimal digits, `&#x`/`&#X` + 1-6 hex digits,
     * or `&` + a letter followed by up to 30 alphanumerics; every form
     * must be terminated by `;`.
     */
    public const PATTERN = '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});';

    /** U+FFFD, substituted for numeric references to invalid codepoints. */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * Decode every HTML entity reference in the given text to its
     * corresponding Unicode codepoint(s). Non-entity bytes pass through
     * unchanged.
     *
     * @param string $text Source text that may contain entity references
     * @return string Text with all recognised entities decoded; the
     *                input itself if the regex engine reports an error
     */
    public static function decode(string $text): string
    {
        $decoded = preg_replace_callback(
            '/' . self::PATTERN . '/',
            static fn(array $m): string => self::decodeOne($m[0]),
            $text
        );

        // preg_replace_callback() returns null when PCRE fails (e.g. a
        // backtrack or JIT stack limit is hit). Returning that would
        // violate the declared string return type and raise a TypeError,
        // so fall back to the undecoded text - the same "leave it
        // literal" outcome used for unknown named references.
        return $decoded ?? $text;
    }

    /**
     * Decode a single entity reference. The caller must have already
     * verified that the input matches self::PATTERN - this is the cheap
     * path for callers that have one match in hand (e.g. the lexer
     * mode), avoiding the preg_replace_callback scan that decode() does.
     *
     * @param string $match A single entity reference, e.g. `&#35;` or `&copy;`
     * @return string The decoded codepoint(s), or the original literal
     *                bytes if the named entity is not recognised
     */
    public static function decodeOne(string $match): string
    {
        // $match[0] is '&'; a '#' right after it marks a numeric reference.
        if ($match[1] === '#') {
            // Numeric refs are decoded explicitly rather than via
            // html_entity_decode: PHP returns the input unchanged for
            // U+0000, surrogates, and codepoints it considers unsafe
            // (including U+10FFFF and BMP noncharacters), where
            // CommonMark requires U+FFFD or the literal codepoint.
            // substr(…, 2, -1) strips the leading "&#" and trailing ";".
            return self::decodeNumeric(substr($match, 2, -1));
        }

        // Unknown names round-trip unchanged; the renderer's &-escape
        // turns them back into &xxx; on output.
        return html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');
    }

    /**
     * Decode the numeric portion of a numeric character reference, with
     * the CommonMark-mandated U+FFFD substitution for invalid codepoints
     * (zero, surrogate range, above U+10FFFF).
     *
     * @param string $body The digits between &# and ; - decimal digits,
     *                     or x/X followed by hex digits
     * @return string The corresponding UTF-8 codepoint, or U+FFFD when
     *                the codepoint is invalid
     */
    protected static function decodeNumeric(string $body): string
    {
        if ($body[0] === 'x' || $body[0] === 'X') {
            // hexdec() yields an int for the at most 6 hex digits that
            // self::PATTERN admits.
            $cp = hexdec(substr($body, 1));
        } else {
            $cp = (int) $body;
        }

        if ($cp === 0 || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
            return self::REPLACEMENT;
        }

        // Treat a failed or empty conversion like an invalid codepoint.
        $char = Unicode::toUtf8([$cp]);
        if ($char === false || $char === '') {
            return self::REPLACEMENT;
        }
        return $char;
    }
}