<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Utf8\Unicode;

/**
 * GFM HTML entity references: numeric (`&#nnn;` and `&#xhhh;`) and
 * HTML5 named (`&copy;`, `&AElig;`, `&ngE;`, ...) decode to the
 * corresponding Unicode codepoint(s) and ride out as cdata.
 *
 * Distinct from the typography Entity mode, which is renderer-side
 * configurable (entities.conf maps `(c)` to `©` etc.). HTML entity
 * references are not configurable - their meaning is fixed by the
 * HTML5 / Unicode specs - so decoding happens at parse time and the
 * renderer needs no changes.
 *
 * Per CommonMark, codepoint 0, codepoints above U+10FFFF, and the
 * surrogate range U+D800..U+DFFF all map to U+FFFD (REPLACEMENT
 * CHARACTER) for numeric references. Unknown named references stay
 * literal: the original `&xxx;` is emitted as cdata and the renderer's
 * &-escaping turns it into `&amp;xxx;` on output.
 *
 * Category SUBSTITION so the mode is reachable in every container
 * that allows substitutions (paragraphs, formatting, list items,
 * table cells, headers). Code spans and code blocks live in
 * CATEGORY_PROTECTED and reject SUBSTITION, so entities stay literal
 * there - matching CommonMark's rule that entities are not recognized
 * in code.
 *
 * Side benefit: by consuming the entire entity run before any
 * structural pattern sees it, this mode automatically enforces the
 * spec rule that numeric references cannot stand in for structural
 * markers. `&#42;foo&#42;` decodes to literal `*foo*` text and never
 * triggers emphasis; `&#42; foo` decodes to literal `* foo` and never
 * starts a list.
 */
class GfmHtmlEntity extends AbstractMode
{
    /** U+FFFD REPLACEMENT CHARACTER, emitted for invalid numeric references. */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * An entity reference is a single token, so the list of allowed
     * modes is left empty.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 255;
    }

    /**
     * Register the entity pattern as a special pattern for the given mode.
     *
     * The pattern accepts, between `&` and `;`:
     *  - `#` followed by 1-7 decimal digits,
     *  - `#x` / `#X` followed by 1-6 hexadecimal digits, or
     *  - a name: one ASCII letter followed by up to 30 ASCII alphanumerics.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern(
            '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});',
            $mode,
            'gfm_html_entity'
        );
    }

    /**
     * Decode the matched entity reference and emit the result as a
     * single cdata call.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        if ($match[1] === '#') {
            // Numeric refs are decoded explicitly rather than via
            // html_entity_decode: PHP returns the input unchanged for
            // U+0000, surrogates, and codepoints it considers unsafe
            // (including U+10FFFF and BMP noncharacters), where
            // CommonMark requires U+FFFD or the literal codepoint.
            // substr() strips the leading "&#" and the trailing ";".
            $char = $this->decodeNumeric(substr($match, 2, -1));
        } else {
            // Unknown names round-trip unchanged: the literal `&xxx;` is
            // emitted as cdata and the renderer's &-escaping turns it
            // into `&amp;xxx;` on output.
            $char = html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');
        }

        $handler->addCall('cdata', [$char], $pos);
        return true;
    }

    /**
     * Turn the body of a numeric reference into a UTF-8 string.
     *
     * @param string $body the reference without the leading `&#` and the
     *                     trailing `;` - decimal digits, or `x`/`X`
     *                     followed by hexadecimal digits
     * @return string the encoded character, or U+FFFD when the value is
     *                0, above U+10FFFF, a surrogate, or cannot be encoded
     */
    protected function decodeNumeric(string $body): string
    {
        // The lexer pattern never produces an empty body; the guard only
        // keeps direct callers from hitting an invalid string offset.
        // An empty body would evaluate to codepoint 0 anyway.
        if ($body === '') {
            return self::REPLACEMENT;
        }

        if ($body[0] === 'x' || $body[0] === 'X') {
            $cp = hexdec(substr($body, 1));
        } else {
            $cp = (int) $body;
        }

        if ($cp === 0 || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
            return self::REPLACEMENT;
        }

        // NOTE(review): a false or empty result from Unicode::toUtf8 is
        // treated as "not encodable"; confirm which in-range codepoints,
        // if any, make the helper return an empty string.
        $char = Unicode::toUtf8([$cp]);
        if ($char === false || $char === '') {
            return self::REPLACEMENT;
        }
        return $char;
    }
}