xref: /dokuwiki/inc/Parsing/Helpers/HtmlEntity.php (revision eb15e634e1400f6c4d78f5fb40179ca25f41574d)
1*eb15e634SAndreas Gohr<?php
2*eb15e634SAndreas Gohr
3*eb15e634SAndreas Gohrnamespace dokuwiki\Parsing\Helpers;
4*eb15e634SAndreas Gohr
5*eb15e634SAndreas Gohruse dokuwiki\Utf8\Unicode;
6*eb15e634SAndreas Gohr
7*eb15e634SAndreas Gohr/**
8*eb15e634SAndreas Gohr * Pure helper for decoding HTML entity references - numeric (`&#nnn;`,
9*eb15e634SAndreas Gohr * `&#xhhh;`) and HTML5 named (`&copy;`, `&AElig;`, ...) - to their
10*eb15e634SAndreas Gohr * Unicode codepoint(s).
11*eb15e634SAndreas Gohr *
12*eb15e634SAndreas Gohr * Whole-span PROTECTED modes (GfmCode, GfmLink, ...) capture their body
13*eb15e634SAndreas Gohr * in a single regex match, so the inline GfmHtmlEntity pattern never
14*eb15e634SAndreas Gohr * sees the bytes inside. For the slots GFM still wants entity-decoded -
15*eb15e634SAndreas Gohr * fenced code info strings, link destinations, link titles - call
16*eb15e634SAndreas Gohr * decode() after extracting the literal substring.
17*eb15e634SAndreas Gohr *
18*eb15e634SAndreas Gohr * Per CommonMark, codepoint 0, codepoints above U+10FFFF, and the
19*eb15e634SAndreas Gohr * surrogate range U+D800..U+DFFF map to U+FFFD (REPLACEMENT CHARACTER)
20*eb15e634SAndreas Gohr * for numeric references. Unknown named references are returned
21*eb15e634SAndreas Gohr * unchanged - the caller emits them literally and the renderer's
22*eb15e634SAndreas Gohr * &-escaping turns them back into `&amp;xxx;` on output.
23*eb15e634SAndreas Gohr */
class HtmlEntity
{
    /**
     * Regex (without delimiters) matching one HTML entity reference:
     * a decimal numeric reference with 1-7 digits, a hexadecimal
     * numeric reference with 1-6 digits, or an alphanumeric name that
     * starts with a letter - always terminated by a semicolon.
     *
     * Shared by GfmHtmlEntity (as the lexer special-pattern) and
     * decode() (as the scan pattern), so the two stay in lockstep.
     */
    public const PATTERN = '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});';

    /** U+FFFD REPLACEMENT CHARACTER, substituted for invalid numeric references. */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * Decode every HTML entity reference in the given text to its
     * corresponding Unicode codepoint(s). Non-entity bytes pass through
     * unchanged.
     *
     * If the underlying regex scan fails, the text is returned as it
     * was passed in rather than raising an error.
     *
     * @param string $text Source text that may contain entity references
     * @return string Text with all recognised entities decoded
     */
    public static function decode(string $text): string
    {
        $decoded = preg_replace_callback(
            '/' . self::PATTERN . '/',
            static fn($m) => self::decodeOne($m[0]),
            $text
        );

        // preg_replace_callback() yields null when PCRE hits an error
        // (e.g. backtrack or JIT stack limit). Returning that null would
        // violate the declared string return type and throw a TypeError,
        // so hand back the undecoded input instead.
        return $decoded ?? $text;
    }

    /**
     * Decode a single entity reference. The caller must have already
     * verified that the input matches self::PATTERN - this is the cheap
     * path for callers that have one match in hand (e.g. the lexer
     * mode), avoiding the preg_replace_callback scan that decode() does.
     *
     * @param string $match A single entity reference, e.g. &#35; or &copy;
     * @return string The decoded codepoint(s), or the original literal
     *                bytes if the named entity is not recognised
     */
    public static function decodeOne(string $match): string
    {
        // $match[0] is the leading '&'; a '#' right after it marks a
        // numeric reference.
        if ($match[1] === '#') {
            // Numeric refs are decoded explicitly rather than via
            // html_entity_decode: PHP returns the input unchanged for
            // U+0000, surrogates, and codepoints it considers unsafe
            // (including U+10FFFF and BMP noncharacters), where
            // CommonMark requires U+FFFD or the literal codepoint.
            // substr(..., 2, -1) strips the '&#' prefix and ';' suffix.
            return self::decodeNumeric(substr($match, 2, -1));
        }

        // Unknown names round-trip unchanged; the renderer's &-escape
        // turns them back into &amp;xxx; on output.
        return html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');
    }

    /**
     * Decode the numeric portion of a numeric character reference, with
     * the CommonMark-mandated U+FFFD substitution for invalid codepoints
     * (zero, surrogate range, above U+10FFFF).
     *
     * @param string $body The digits between &# and ; - decimal digits,
     *                     or x/X followed by hex digits
     * @return string The corresponding UTF-8 codepoint, or U+FFFD when
     *                the codepoint is invalid
     */
    protected static function decodeNumeric(string $body): string
    {
        if ($body[0] === 'x' || $body[0] === 'X') {
            // Hexadecimal form: skip the x/X marker before converting.
            $cp = hexdec(substr($body, 1));
        } else {
            $cp = (int) $body;
        }

        if ($cp === 0 || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
            return self::REPLACEMENT;
        }

        // Treat a failed or empty conversion like an invalid codepoint.
        $char = Unicode::toUtf8([$cp]);
        if ($char === false || $char === '') {
            return self::REPLACEMENT;
        }
        return $char;
    }
}
106