xref: /dokuwiki/_test/tests/Parsing/Markdown/GfmSpecTest.php (revision 8a34b0d87864546b9e35ee6a4621d30bf4cd6475)
1<?php
2
3namespace dokuwiki\test\Parsing\Markdown;
4
5use dokuwiki\Parsing\ModeRegistry;
6
7/**
8 * Roundtrip tests driven by GFM's spec.txt.
9 *
10 * Each example in gfm-spec/spec.txt becomes one data-provider case. The
11 * markdown input is run through DokuWiki's full pipeline (parser + XHTML
12 * renderer) and the result is compared to the expected HTML from the spec,
13 * tolerating whitespace differences around block-level tags.
14 *
15 * `gfm-spec/skip.php` lists examples that are deliberately out of scope
16 * for DokuWiki (e.g. CommonMark flanking-delimiter edge cases). Those are
17 * reported as skipped with a reason.
18 */
class GfmSpecTest extends \DokuWikiTest
{
    private const FIXTURE_DIR = __DIR__ . '/gfm-spec/';

    /**
     * Data provider: one named case per example read from gfm-spec/spec.txt.
     *
     * Every case is `[markdown, expected HTML, skip reason|null]` and is keyed
     * by "#<number> <section>" so a failing example can be located in the spec.
     * The skip reason is looked up by example number in the array returned by
     * gfm-spec/skip.php; examples without an entry get null.
     *
     * @return iterable<string, array{0: string, 1: string, 2: ?string}>
     */
    public static function specProvider(): iterable
    {
        $reader = new SpecReader(self::FIXTURE_DIR . 'spec.txt');
        $skip   = require self::FIXTURE_DIR . 'skip.php';

        // Spec convention (spec.txt §"About this document"): the `→`
        // character in examples represents a literal tab. Restore the
        // tab in both input and expected output so the corpus exercises
        // real tab-handling behavior, not arrow-character handling.
        $arrowToTab = ["\u{2192}" => "\t"];

        foreach ($reader->examples() as $ex) {
            $reason = $skip[$ex['number']] ?? null;
            $label  = sprintf('#%d %s', $ex['number'], $ex['section']);
            $md     = strtr($ex['markdown'], $arrowToTab);
            $html   = strtr($ex['html'], $arrowToTab);
            yield $label => [$md, $html, $reason];
        }
    }

    /**
     * Render one spec example and compare it with the spec's reference HTML.
     *
     * Examples that carry a skip reason are reported as skipped and are not
     * rendered at all.
     *
     * @dataProvider specProvider
     */
    public function testExample(string $md, string $expected, ?string $skipReason): void
    {
        if ($skipReason !== null) {
            // markTestSkipped() throws, so nothing below runs for skipped examples.
            $this->markTestSkipped($skipReason);
        }
        $actual = $this->renderMarkdown($md);
        $this->assertHtmlEquals($expected, $actual);
    }

    /**
     * Reset the mode registry after each example, then run the parent teardown.
     */
    public function tearDown(): void
    {
        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * Render markdown text through DokuWiki's full parser pipeline under
     * the `md` syntax setting, using {@see SpecCompatRenderer} —
     * an XHTML renderer subclass that emits the minimal link/media HTML
     * shape the GFM spec expects. Production rendering is unchanged;
     * this override exists so spec output can be compared byte-for-byte.
     *
     * Typography is forced off for the spec run: $conf[typography] = 0
     * keeps the Quotes and MultiplyEntity modes (curly quote pairing,
     * apostrophe to numeric entity) out of the mode list. Both are
     * correct for production wiki prose but diverge byte-for-byte from
     * spec output. SpecCompatRenderer additionally neutralizes the
     * Entity-table substitutions (--, ---, ->, (c), ...) at render time;
     * see SpecCompatRenderer::entity().
     *
     * The renderer's acronym table is left empty so the parser-emitted
     * acronym() calls (e.g. for "FTP") fall through to literal text
     * instead of wrapping in <abbr>, which the spec output never has.
     *
     * @param string $text markdown source of a single spec example
     * @return string the renderer's accumulated output document
     */
    private function renderMarkdown(string $text): string
    {
        global $conf;
        $conf['syntax'] = 'md';
        $conf['typography'] = 0;
        // Reset the registry only after the configuration above has been changed.
        ModeRegistry::reset();

        $instructions = p_get_instructions($text);

        $renderer = new SpecCompatRenderer();
        $renderer->reset();
        $renderer->smileys   = getSmileys();
        $renderer->entities  = getEntities();
        $renderer->acronyms  = [];
        $renderer->interwiki = getInterwiki();

        // Replay each instruction as a renderer method call; instructions the
        // renderer has no method for are silently ignored.
        foreach ($instructions as $instruction) {
            if (method_exists($renderer, $instruction[0])) {
                call_user_func_array([$renderer, $instruction[0]], $instruction[1] ?: []);
            }
        }
        return $renderer->doc;
    }

    /**
     * Assert two HTML strings are identical after whitespace normalization.
     *
     * DokuWiki's XHTML renderer emits extra whitespace around block tags
     * that the spec's reference HTML omits. The comparator strips whitespace
     * only around **block-level** tags (p, div, h1-h6, ul/ol/li, table/tr/td,
     * blockquote, pre, hr). Whitespace around **inline** tags (em, strong,
     * a, code, span, img, br, etc.) is preserved, because `<em>x</em> y`
     * and `<em>x</em>y` render differently.
     *
     * @param string $expected reference HTML from the spec
     * @param string $actual   HTML produced by the renderer
     */
    private function assertHtmlEquals(string $expected, string $actual): void
    {
        // Both operands are strings; assertSame states the strict,
        // byte-for-byte comparison explicitly.
        $this->assertSame(
            $this->normalizeHtml($expected),
            $this->normalizeHtml($actual)
        );
    }

    /**
     * Strip whitespace adjacent to block-level tags; leave inline tags alone.
     *
     * Additionally drops markup that carries no semantic meaning for
     * GFM-conformance checks. Note that the same rules are applied to the
     * expected and the actual HTML alike:
     *
     * - `<div class="levelN">` section wrappers the renderer emits after
     *   every header call, and every closing `</div>` (not only the ones
     *   paired with a level wrapper).
     * - every HTML comment whose body contains no `<`, e.g. the
     *   `<!-- EDIT... -->` markers of the section-edit machinery.
     * - `class="..."` / `id="..."` attributes on h1-h6 (section-edit anchor
     *   and header-id generation; fine to ignore, the spec output has none).
     *
     * @param string $html HTML fragment to normalize
     * @return string normalized HTML with leading/trailing whitespace trimmed
     * @throws \RuntimeException if any of the replacements fails inside PCRE
     */
    private function normalizeHtml(string $html): string
    {
        $block = 'p|div|h[1-6]|hr|ul|ol|li|blockquote|pre|table|thead|tbody|tfoot|tr|th|td';

        // Drop DokuWiki's `<div class="levelN">` section wrappers and the
        // HTML comments (`<!-- EDIT... -->`) its section-edit machinery
        // inserts after each heading. Neither is semantically part of the
        // heading and GFM reference output never contains them.
        $html = $this->replaceOrFail('#<div class="level[1-6]">\s*#', '', $html);
        $html = $this->replaceOrFail('#\s*</div>\s*#', '', $html);
        $html = $this->replaceOrFail('#<!--[^<]*?-->#', '', $html);

        // Strip sectionedit/id decoration from headings.
        $html = $this->replaceOrFail('#<(h[1-6])(?:\s+(?:class|id)="[^"]*")+\s*>#', '<$1>', $html);

        // Whitespace before/after an opening block tag (including attributes)
        $html = $this->replaceOrFail('#\s*<(' . $block . ')((?:\s[^>]*)?)>\s*#', '<$1$2>', $html);
        // Whitespace before/after a closing block tag
        $html = $this->replaceOrFail('#\s*</(' . $block . ')>\s*#', '</$1>', $html);

        return trim($html);
    }

    /**
     * preg_replace() wrapper that refuses to continue after a PCRE failure.
     *
     * preg_replace() returns null on error. Left unchecked, that null would
     * be carried through the remaining replacements and end up as an empty
     * string, so two unrelated documents could normalize to '' and compare
     * as identical. Throwing turns that silent false positive into an error.
     *
     * @param string $pattern     PCRE pattern including delimiters
     * @param string $replacement replacement string
     * @param string $subject     text to operate on
     * @return string the subject with all matches replaced
     * @throws \RuntimeException if preg_replace() reports an error
     */
    private function replaceOrFail(string $pattern, string $replacement, string $subject): string
    {
        $result = preg_replace($pattern, $replacement, $subject);
        if ($result === null) {
            throw new \RuntimeException(
                sprintf('preg_replace() failed for pattern %s (PCRE error code %d)', $pattern, preg_last_error())
            );
        }
        return $result;
    }
}
154