<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Utf8\Unicode;

/**
 * Parser mode for GFM HTML entity references.
 *
 * Two flavours are recognised and decoded at parse time into the Unicode
 * codepoint(s) they stand for, then emitted as plain cdata:
 *
 *  - numeric references, decimal (`&#nnn;`) or hexadecimal (`&#xhhh;`)
 *  - HTML5 named references (`&copy;`, `&AElig;`, `&ngE;`, ...)
 *
 * This is not the typography Entity mode. That one is configurable on the
 * renderer side (entities.conf maps `(c)` to `©` etc.), whereas the meaning
 * of an HTML entity reference is fixed by the HTML5 / Unicode specs. Because
 * nothing is configurable, decoding happens here and the renderer needs no
 * changes.
 *
 * Following CommonMark, a numeric reference to codepoint 0, to anything
 * above U+10FFFF, or to the surrogate range U+D800..U+DFFF yields U+FFFD
 * (REPLACEMENT CHARACTER). An unknown named reference stays literal: the
 * original `&xxx;` text is emitted as cdata, and the renderer's &-escaping
 * writes it out as `&amp;xxx;`.
 *
 * The mode belongs to category SUBSTITION, so it is reachable in every
 * container that allows substitutions (paragraphs, formatting, list items,
 * table cells, headers). Code spans and code blocks live in
 * CATEGORY_PROTECTED and reject SUBSTITION, which keeps entities literal
 * there - in line with CommonMark's rule that entities are not recognised
 * inside code.
 *
 * Side benefit: the whole entity run is consumed before any structural
 * pattern gets to see it, which enforces the spec rule that numeric
 * references cannot stand in for structural markers. `&#42;foo&#42;`
 * decodes to the literal text `*foo*` and never triggers emphasis;
 * `&#42; foo` decodes to literal `* foo` and never starts a list.
 */
class GfmHtmlEntity extends AbstractMode
{
    /** Output used for numeric references that do not name a usable codepoint. */
    protected const REPLACEMENT = "\u{FFFD}";

    /**
     * An entity reference is atomic: no other mode may nest inside it.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 255;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // One pattern covers all three shapes: decimal (1-7 digits),
        // hexadecimal (1-6 digits) and named (a letter followed by up to
        // 30 alphanumerics), each terminated by a mandatory semicolon.
        $this->Lexer->addSpecialPattern(
            '&(?:#(?:[0-9]{1,7}|[xX][0-9a-fA-F]{1,6})|[a-zA-Z][a-zA-Z0-9]{0,30});',
            $mode,
            'gfm_html_entity'
        );
    }

    /** @inheritdoc */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // The byte right after the ampersand tells numeric from named.
        $isNumericReference = $match[1] === '#';

        // Numeric references are decoded by hand instead of going through
        // html_entity_decode(): PHP hands back the input untouched for
        // U+0000, surrogates and codepoints it considers unsafe (U+10FFFF
        // and the BMP noncharacters among them), while CommonMark asks for
        // U+FFFD or the literal codepoint.
        //
        // Named references do go through html_entity_decode(); a name it
        // does not know comes back unchanged and the renderer's &-escape
        // later writes it out as &amp;xxx;.
        $decoded = $isNumericReference
            ? $this->decodeNumeric(substr($match, 2, -1))
            : html_entity_decode($match, ENT_HTML5 | ENT_QUOTES, 'UTF-8');

        $handler->addCall('cdata', [$decoded], $pos);
        return true;
    }

    /**
     * Turn the body of a numeric reference into its UTF-8 character.
     *
     * @param string $body the reference without the leading `&#` and the
     *                     trailing `;`, e.g. `169` or `xA9`
     * @return string the encoded character, or U+FFFD when the codepoint is
     *                0, above U+10FFFF, a surrogate, or cannot be encoded
     */
    protected function decodeNumeric(string $body): string
    {
        $isHexadecimal = in_array($body[0], ['x', 'X'], true);
        $codepoint = $isHexadecimal ? hexdec(substr($body, 1)) : (int) $body;

        $isSurrogate = $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
        if ($codepoint === 0 || $codepoint > 0x10FFFF || $isSurrogate) {
            return self::REPLACEMENT;
        }

        // Fall back to the replacement character if the encoder rejects
        // the codepoint or produces nothing.
        $encoded = Unicode::toUtf8([$codepoint]);
        $encodingFailed = $encoded === false || $encoded === '';

        return $encodingFailed ? self::REPLACEMENT : $encoded;
    }
}