<?php

namespace dokuwiki\Parsing\Helpers;

use dokuwiki\Utf8\Unicode;

/**
 * Pure helper for decoding HTML entity references - numeric (`&#nnn;`,
 * `&#xhhh;`) and HTML5 named (`&copy;`, `&AElig;`, ...) - to their
 * Unicode codepoint(s).
 *
 * Whole-span PROTECTED modes (GfmCode, GfmLink, ...) capture their body
 * in a single regex match, so the inline GfmHtmlEntity pattern never
 * sees the bytes inside. For the slots GFM still wants entity-decoded -
 * fenced code info strings, link destinations, link titles - call
 * decode() after extracting the literal substring.
 *
 * Per CommonMark, codepoint 0, codepoints above U+10FFFF, and the
 * surrogate range U+D800..U+DFFF map to U+FFFD (REPLACEMENT CHARACTER)
 * for numeric references. Unknown named references are returned
 * unchanged - the caller emits them literally and the renderer's
 * &-escaping turns them back into `&amp;xxx;` on output.
 */
class HtmlEntity
{
    /**
     * Regex matching one HTML entity reference. Shared by GfmHtmlEntity
     * (as the lexer special-pattern) and decode() (as the scan
     * pattern), so the two stay in lockstep.
     *
     * Accepts 1-7 decimal digits, 1-6 hex digits after x/X, or a name
     * of 1-31 ASCII alphanumerics starting with a letter; the trailing
     * semicolon is mandatory. The pattern carries no delimiters.
     */
    public const PATTERN = '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});';

    /** UTF-8 encoding of U+FFFD, substituted for invalid numeric references. */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * Decode every HTML entity reference in the given text to its
     * corresponding Unicode codepoint(s). Non-entity bytes pass through
     * unchanged.
     *
     * @param string $text Source text that may contain entity references
     * @return string Text with all recognised entities decoded; the
     *                input itself if the regex engine reports an error
     */
    public static function decode(string $text): string
    {
        $decoded = preg_replace_callback(
            '/' . self::PATTERN . '/',
            static fn(array $m): string => self::decodeOne($m[0]),
            $text
        );

        // preg_replace_callback() yields null on a PCRE runtime error.
        // Returning that would violate the declared string return type,
        // so fall back to the untouched input instead.
        return $decoded ?? $text;
    }

    /**
     * Decode a single entity reference. The caller must have already
     * verified that the input matches self::PATTERN — this is the cheap
     * path for callers that have one match in hand (e.g. the lexer
     * mode), avoiding the preg_replace_callback scan that decode() does.
     *
     * @param string $match A single entity reference, e.g. `&#35;` or `&copy;`
     * @return string The decoded codepoint(s), or the original literal
     *                bytes if the named entity is not recognised
     */
    public static function decodeOne(string $match): string
    {
        // The null-coalescing read keeps an input shorter than two
        // bytes from raising an "Uninitialized string offset" warning;
        // such input simply takes the named-entity path below.
        if (($match[1] ?? '') === '#') {
            // Numeric refs are decoded explicitly rather than via
            // html_entity_decode: PHP returns the input unchanged for
            // U+0000, surrogates, and codepoints it considers unsafe
            // (including U+10FFFF and BMP noncharacters), where
            // CommonMark requires U+FFFD or the literal codepoint.
            // substr() strips the leading "&#" and the trailing ";".
            return self::decodeNumeric(substr($match, 2, -1));
        }

        // Unknown names round-trip unchanged; the renderer's &-escape
        // turns them back into &amp;xxx; on output.
        return html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');
    }

    /**
     * Decode the numeric portion of a numeric character reference, with
     * the CommonMark-mandated U+FFFD substitution for invalid codepoints
     * (zero, surrogate range, above U+10FFFF).
     *
     * @param string $body The digits between &# and ; — decimal digits,
     *                     or x/X followed by hex digits
     * @return string The corresponding UTF-8 codepoint, or U+FFFD when
     *                the codepoint is invalid
     */
    protected static function decodeNumeric(string $body): string
    {
        if ($body[0] === 'x' || $body[0] === 'X') {
            // Skip the x/X marker and parse the remainder as hex.
            $cp = hexdec(substr($body, 1));
        } else {
            $cp = (int) $body;
        }

        if ($cp === 0 || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
            return self::REPLACEMENT;
        }

        // Treat a false or empty encoder result as an invalid codepoint.
        $char = Unicode::toUtf8([$cp]);
        if ($char === false || $char === '') {
            return self::REPLACEMENT;
        }
        return $char;
    }
}