xref: /dokuwiki/inc/Parsing/Helpers/HtmlEntity.php (revision 2e43b79909f3bc04928779d886f68c1242b5d436)
1<?php
2
3namespace dokuwiki\Parsing\Helpers;
4
5use dokuwiki\Utf8\Unicode;
6
7/**
8 * Pure helper for decoding HTML entity references - numeric (`&#nnn;`,
9 * `&#xhhh;`) and HTML5 named (`&copy;`, `&AElig;`, ...) - to their
10 * Unicode codepoint(s).
11 *
12 * Whole-span PROTECTED modes (GfmCode, GfmLink, ...) capture their body
13 * in a single regex match, so the inline GfmHtmlEntity pattern never
14 * sees the bytes inside. For the slots GFM still wants entity-decoded -
15 * fenced code info strings, link destinations, link titles - call
16 * decode() after extracting the literal substring.
17 *
18 * Per CommonMark, codepoint 0, codepoints above U+10FFFF, and the
19 * surrogate range U+D800..U+DFFF map to U+FFFD (REPLACEMENT CHARACTER)
20 * for numeric references. Unknown named references are returned
21 * unchanged - the caller emits them literally and the renderer's
22 * &-escaping turns them back into `&amp;xxx;` on output.
23 */
class HtmlEntity
{
    /**
     * Regex (without delimiters) matching exactly one HTML entity
     * reference: `&#` + 1-7 decimal digits, `&#x`/`&#X` + 1-6 hex
     * digits, or `&` + a name of 1-31 ASCII alphanumerics starting with
     * a letter - each terminated by `;`.
     *
     * Shared by GfmHtmlEntity (as the lexer special-pattern) and
     * decode() (as the scan pattern), so the two stay in lockstep.
     */
    public const PATTERN = '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});';

    /** U+FFFD REPLACEMENT CHARACTER, substituted for invalid numeric references. */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * Decode every HTML entity reference in the given text to its
     * corresponding Unicode codepoint(s). Bytes that are not part of a
     * reference matching self::PATTERN pass through unchanged.
     *
     * If the PCRE engine aborts the scan (preg_replace_callback()
     * returns null on error), the input is returned undecoded instead
     * of violating the declared string return type.
     *
     * @param string $text Source text that may contain entity references
     * @return string Text with all recognised entities decoded
     */
    public static function decode(string $text): string
    {
        $decoded = preg_replace_callback(
            '/' . self::PATTERN . '/',
            static fn($m) => self::decodeOne($m[0]),
            $text
        );

        // null signals a PCRE runtime error; leaving the entities
        // undecoded is the safe fallback, a TypeError is not.
        return $decoded ?? $text;
    }

    /**
     * Decode a single entity reference. The caller is expected to have
     * already verified that the input matches self::PATTERN - this is
     * the cheap path for callers that have one match in hand (e.g. the
     * lexer mode), avoiding the preg_replace_callback scan that
     * decode() does.
     *
     * @param string $match A single entity reference, e.g. &#35; or &copy;
     * @return string The decoded codepoint(s), or the original literal
     *                bytes if the named entity is not recognised
     */
    public static function decodeOne(string $match): string
    {
        // Null-coalesce the offset read so an input shorter than two
        // bytes falls through to the named path without a warning.
        if (($match[1] ?? '') === '#') {
            // Numeric refs are decoded explicitly rather than via
            // html_entity_decode: PHP returns the input unchanged for
            // U+0000, surrogates, and codepoints it considers unsafe
            // (including U+10FFFF and BMP noncharacters), where
            // CommonMark requires U+FFFD or the literal codepoint.
            // substr(…, 2, -1) strips the leading "&#" and trailing ";".
            return self::decodeNumeric(substr($match, 2, -1));
        }

        // Unknown names round-trip unchanged; the renderer's &-escape
        // turns them back into &amp;xxx; on output.
        return html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');
    }

    /**
     * Decode the numeric portion of a numeric character reference, with
     * the CommonMark-mandated U+FFFD substitution for invalid codepoints
     * (zero, surrogate range, above U+10FFFF).
     *
     * @param string $body The digits between &# and ; - decimal digits,
     *                     or x/X followed by hex digits
     * @return string The corresponding UTF-8 codepoint, or U+FFFD when
     *                the codepoint is invalid
     */
    protected static function decodeNumeric(string $body): string
    {
        if ($body[0] === 'x' || $body[0] === 'X') {
            // Skip the x/X marker; PATTERN caps the hex digits at six,
            // so hexdec() stays within integer range here.
            $cp = hexdec(substr($body, 1));
        } else {
            $cp = (int) $body;
        }

        if ($cp === 0 || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
            return self::REPLACEMENT;
        }

        // Treat a false or empty result from the encoder as "could not
        // encode" and substitute U+FFFD as well.
        $char = Unicode::toUtf8([$cp]);
        if ($char === false || $char === '') {
            return self::REPLACEMENT;
        }
        return $char;
    }
}
106