xref: /dokuwiki/inc/Parsing/ParserMode/GfmHtmlEntity.php (revision d20858669cbb910f566e0b7d1ba9da293d1b794e)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Utf8\Unicode;
7
/**
 * Parser mode for GFM / CommonMark HTML entity references.
 *
 * Numeric references (`&#nnn;` and `&#xhhh;`) and HTML5 named references
 * (`&copy;`, `&AElig;`, `&ngE;`, ...) are decoded to the corresponding
 * Unicode codepoint(s) while parsing and passed on as cdata.
 *
 * This is deliberately separate from the typography Entity mode. That
 * mode is configurable on the renderer side (entities.conf maps `(c)` to
 * `©` etc.), whereas the meaning of an HTML entity reference is fixed by
 * the HTML5 / Unicode specs. Decoding therefore happens at parse time and
 * the renderer needs no changes.
 *
 * Following CommonMark, a numeric reference to codepoint 0, to a
 * codepoint above U+10FFFF, or to the surrogate range U+D800..U+DFFF
 * yields U+FFFD (REPLACEMENT CHARACTER). An unknown named reference stays
 * literal: the original `&xxx;` is emitted as cdata and the renderer's
 * &-escaping turns it into `&amp;xxx;` on output.
 *
 * The mode belongs to category SUBSTITION, which makes it reachable in
 * every container that allows substitutions (paragraphs, formatting,
 * list items, table cells, headers). Code spans and code blocks live in
 * CATEGORY_PROTECTED and reject SUBSTITION, so entities stay literal
 * there - matching CommonMark's rule that entities are not recognized in
 * code.
 *
 * Side benefit: because the whole entity run is consumed before any
 * structural pattern sees it, the spec rule that numeric references
 * cannot stand in for structural markers is enforced automatically.
 * `&#42;foo&#42;` decodes to literal `*foo*` text and never triggers
 * emphasis; `&#42; foo` decodes to literal `* foo` and never starts a
 * list.
 */
class GfmHtmlEntity extends AbstractMode
{
    /** Substitute emitted for numeric references that cannot be decoded (U+FFFD). */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * Set up the mode with an empty list of allowed nested modes.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /**
     * @inheritdoc
     *
     * This mode always reports the fixed sort value 255.
     */
    public function getSort()
    {
        return 255;
    }

    /**
     * @inheritdoc
     *
     * Registers one special pattern, under the name `gfm_html_entity`, that
     * covers all three reference forms: decimal (1-7 digits), hexadecimal
     * (`x`/`X` followed by 1-6 hex digits) and named (an ASCII letter
     * followed by up to 30 ASCII alphanumerics). Every form must be
     * terminated by `;`.
     */
    public function connectTo($mode)
    {
        $pattern = '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});';

        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_html_entity');
    }

    /**
     * @inheritdoc
     *
     * Decodes the matched reference and adds the result to the handler as a
     * single `cdata` call at the match position. Always returns true.
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // $match always starts with '&'; a '#' directly after it marks a
        // numeric reference, anything else is a named one.
        $isNumeric = $match[1] === '#';

        // Numeric references: decoded by hand instead of through
        // html_entity_decode, because PHP hands back the input unchanged
        // for U+0000, surrogates, and codepoints it considers unsafe
        // (including U+10FFFF and BMP noncharacters), where CommonMark
        // requires U+FFFD or the literal codepoint. Only the part between
        // "&#" and ";" is passed on.
        //
        // Named references: html_entity_decode leaves unknown names
        // untouched, so they travel on as the literal `&xxx;` and the
        // renderer's &-escaping emits them as `&amp;xxx;`.
        $decoded = $isNumeric
            ? $this->decodeNumeric(substr($match, 2, -1))
            : html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');

        $handler->addCall('cdata', [$decoded], $pos);
        return true;
    }

    /**
     * Turn the body of a numeric character reference into UTF-8 text.
     *
     * @param string $body reference without the leading `&#` and trailing
     *                     `;`: decimal digits, or `x`/`X` followed by hex digits
     * @return string the encoded character, or U+FFFD when the codepoint is
     *                0, above U+10FFFF, a surrogate, or cannot be encoded
     */
    protected function decodeNumeric(string $body): string
    {
        $isHex = $body[0] === 'x' || $body[0] === 'X';
        $codepoint = $isHex ? hexdec(substr($body, 1)) : (int) $body;

        // NUL and anything beyond the last Unicode codepoint
        if ($codepoint === 0 || $codepoint > 0x10FFFF) {
            return self::REPLACEMENT;
        }

        // UTF-16 surrogate halves are not characters on their own
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return self::REPLACEMENT;
        }

        $encoded = Unicode::toUtf8([$codepoint]);

        // NOTE(review): an empty result is treated as undecodable as well -
        // confirm which codepoints, if any, Unicode::toUtf8 encodes as ''.
        $failed = $encoded === false || $encoded === '';

        return $failed ? self::REPLACEMENT : $encoded;
    }
}
103