<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Utf8\Unicode;

/**
 * Parser mode for GFM HTML entity references.
 *
 * Numeric references (`&#nnn;` and `&#xhhh;`) and HTML5 named references
 * (`&copy;`, `&AElig;`, `&ngE;`, ...) are decoded to their Unicode
 * codepoint(s) while parsing and emitted as cdata.
 *
 * This is separate from the typography Entity mode, which is configurable
 * on the renderer side (entities.conf maps `(c)` to `©` etc.). The meaning
 * of an HTML entity reference is fixed by the HTML5 / Unicode specs and is
 * not configurable, so it is resolved at parse time and the renderer needs
 * no changes.
 *
 * Following CommonMark, a numeric reference to codepoint 0, to a codepoint
 * above U+10FFFF, or to the surrogate range U+D800..U+DFFF yields U+FFFD
 * (REPLACEMENT CHARACTER). An unknown named reference is kept literally:
 * the original `&xxx;` is emitted as cdata and the renderer's &-escaping
 * writes it out as `&amp;xxx;`.
 *
 * The mode is in category SUBSTITION, so it is reachable in every container
 * that allows substitutions (paragraphs, formatting, list items, table
 * cells, headers). Code spans and code blocks live in CATEGORY_PROTECTED
 * and reject SUBSTITION, so entities stay literal there - in line with
 * CommonMark's rule that entities are not recognized in code.
 *
 * Side benefit: because the whole entity run is consumed before any
 * structural pattern sees it, numeric references cannot stand in for
 * structural markers, as the spec requires. `&#42;foo&#42;` becomes the
 * literal text `*foo*` and never triggers emphasis; `&#42; foo` becomes the
 * literal text `* foo` and never starts a list.
 */
class GfmHtmlEntity extends AbstractMode
{
    /** Emitted in place of numeric references that have no valid decoding */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * An entity reference is atomic: no other mode may nest inside it.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 255;
    }

    /**
     * Register the entity pattern with the lexer.
     *
     * The pattern accepts `&`, then one of
     *  - `#` followed by 1 to 7 decimal digits,
     *  - `#x` / `#X` followed by 1 to 6 hexadecimal digits,
     *  - an ASCII letter followed by up to 30 ASCII letters or digits,
     * and a terminating `;`.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern(
            '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});',
            $mode,
            'gfm_html_entity'
        );
    }

    /**
     * Decode the matched reference and hand the result to the handler as cdata.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $isNumeric = $match[1] === '#';

        // Numeric references get their own decoder instead of
        // html_entity_decode: PHP hands the input back unchanged for U+0000,
        // surrogates and codepoints it considers unsafe (including U+10FFFF
        // and BMP noncharacters), where CommonMark requires U+FFFD or the
        // literal codepoint.
        //
        // Named references go through html_entity_decode; an unknown name
        // comes back unchanged and the renderer's &-escape turns it into
        // &amp;xxx; on output.
        $text = $isNumeric
            ? $this->decodeNumeric(substr($match, 2, -1))
            : html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');

        $handler->addCall('cdata', [$text], $pos);
        return true;
    }

    /**
     * Turn the body of a numeric reference into a UTF-8 string.
     *
     * @param string $body the reference without the leading `&#` and the
     *                     trailing `;` - decimal digits, or `x`/`X` followed
     *                     by hexadecimal digits
     * @return string the encoded character, or U+FFFD when the codepoint is
     *                0, a surrogate, above U+10FFFF, or when the UTF-8
     *                helper returns false or an empty string
     */
    protected function decodeNumeric(string $body): string
    {
        $isHex = $body[0] === 'x' || $body[0] === 'X';
        $codepoint = $isHex ? hexdec(substr($body, 1)) : (int) $body;

        $isSurrogate = $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
        if ($codepoint === 0 || $isSurrogate || $codepoint > 0x10FFFF) {
            return self::REPLACEMENT;
        }

        $utf8 = Unicode::toUtf8([$codepoint]);

        return ($utf8 === false || $utf8 === '') ? self::REPLACEMENT : $utf8;
    }
}