xref: /dokuwiki/_test/tests/Parsing/Markdown/GfmSpecTest.php (revision 47a02a102092be9e1e6f1ddaf158bdfffdb13d4f)
1<?php
2
3namespace dokuwiki\test\Parsing\Markdown;
4
/**
 * Roundtrip tests driven by GFM's spec.txt.
 *
 * Each example in gfm-spec/spec.txt becomes one data-provider case. The
 * markdown input is run through DokuWiki's full pipeline (parser + XHTML
 * renderer) and the result is compared to the expected HTML from the spec,
 * tolerating whitespace differences around block-level tags.
 *
 * `gfm-spec/skip.php` lists examples that are deliberately out of scope
 * for DokuWiki (e.g. CommonMark flanking-delimiter edge cases). Those are
 * reported as skipped with a reason.
 */
class GfmSpecTest extends \DokuWikiTest
{
    private const FIXTURE_DIR = __DIR__ . '/gfm-spec/';

    /**
     * Spec convention (spec.txt §"About this document"): the `→` character
     * in examples represents a literal tab. This map is applied to both the
     * markdown input and the expected HTML so the corpus exercises real
     * tab-handling behavior, not arrow-character handling.
     */
    private const TAB_PLACEHOLDER = ["\u{2192}" => "\t"];

    /**
     * Yield one named data set per spec example.
     *
     * Each data set is `[markdown, expectedHtml, skipReason]`, where
     * `skipReason` is the entry from skip.php keyed by the example number,
     * or null when the example is not listed there.
     *
     * @return iterable<string, array{0: string, 1: string, 2: ?string}>
     * @throws \UnexpectedValueException when skip.php does not return a
     *                                   number => reason map
     */
    public static function specProvider(): iterable
    {
        $reader = new SpecReader(self::FIXTURE_DIR . 'spec.txt');
        $skip   = require self::FIXTURE_DIR . 'skip.php';

        // A skip.php without a `return` statement makes `require` evaluate
        // to int 1; `$skip[...] ?? null` would then quietly yield null for
        // every example and disable the whole skip list. Fail loudly instead.
        if (!is_array($skip) && !($skip instanceof \ArrayAccess)) {
            throw new \UnexpectedValueException(
                'gfm-spec/skip.php must return an array of example number => skip reason'
            );
        }

        foreach ($reader->examples() as $ex) {
            $label = sprintf('#%d %s', $ex['number'], $ex['section']);

            yield $label => [
                strtr($ex['markdown'], self::TAB_PLACEHOLDER),
                strtr($ex['html'], self::TAB_PLACEHOLDER),
                $skip[$ex['number']] ?? null,
            ];
        }
    }

    /**
     * Render one spec example and compare it with the spec's reference HTML.
     *
     * @dataProvider specProvider
     *
     * @param string      $md         markdown input of the example
     * @param string      $expected   reference HTML of the example
     * @param string|null $skipReason non-null marks the example as skipped
     */
    public function testExample(string $md, string $expected, ?string $skipReason): void
    {
        if ($skipReason !== null) {
            $this->markTestSkipped($skipReason);
        }

        $this->assertHtmlEquals($expected, $this->renderMarkdown($md));
    }

    /**
     * Render markdown text through DokuWiki's full parser pipeline under
     * the `md` syntax setting, using {@see SpecCompatRenderer} —
     * an XHTML renderer subclass that emits the minimal link/media HTML
     * shape the GFM spec expects. Production rendering is unchanged;
     * this override exists so spec output can be compared directly, with
     * only the whitespace/decoration normalization of normalizeHtml()
     * in between.
     *
     * Typography is forced off for the spec run: $conf[typography] = 0
     * keeps the Quotes and MultiplyEntity modes (curly quote pairing,
     * apostrophe to numeric entity) out of the mode list. Both are
     * correct for production wiki prose but diverge from spec output.
     * SpecCompatRenderer additionally neutralizes the Entity-table
     * substitutions (--, ---, ->, (c), ...) at render time;
     * see SpecCompatRenderer::entity().
     *
     * The renderer's acronym table is left empty so the parser-emitted
     * acronym() calls (e.g. for "FTP") fall through to literal text
     * instead of wrapping in <abbr>, which the spec output never has.
     *
     * @param string $text markdown source
     * @return string the renderer's accumulated document
     */
    private function renderMarkdown(string $text): string
    {
        global $conf;
        $conf['syntax'] = 'md';
        $conf['typography'] = 0;

        $instructions = p_get_instructions($text);

        $renderer = new SpecCompatRenderer();
        $renderer->reset();
        $renderer->smileys   = getSmileys();
        $renderer->entities  = getEntities();
        $renderer->acronyms  = [];
        $renderer->interwiki = getInterwiki();

        // Dispatch each instruction to the renderer method of the same
        // name; instructions the renderer has no method for are ignored.
        foreach ($instructions as $instruction) {
            if (method_exists($renderer, $instruction[0])) {
                call_user_func_array([$renderer, $instruction[0]], $instruction[1] ?: []);
            }
        }
        return $renderer->doc;
    }

    /**
     * Assert two HTML strings are equivalent after whitespace normalization.
     *
     * DokuWiki's XHTML renderer emits extra whitespace around block tags
     * that the spec's reference HTML omits. The comparator strips whitespace
     * only around **block-level** tags (p, div, h1-h6, ul/ol/li, table/tr/td,
     * blockquote, pre, hr). Whitespace around **inline** tags (em, strong,
     * a, code, span, img, br, etc.) is preserved, because `<em>x</em> y`
     * and `<em>x</em>y` render differently.
     *
     * @param string $expected reference HTML from the spec
     * @param string $actual   HTML produced by the renderer
     */
    private function assertHtmlEquals(string $expected, string $actual): void
    {
        // Both operands are strings; assertSame demands strict identity.
        $this->assertSame(
            $this->normalizeHtml($expected),
            $this->normalizeHtml($actual)
        );
    }

    /**
     * Strip whitespace adjacent to block-level tags; leave inline tags alone.
     *
     * Additionally drops DokuWiki-specific decoration that carries no
     * semantic meaning for GFM-conformance checks:
     *
     * - `<div class="levelN">` section wrappers the renderer emits after
     *   every header call. Note that the matching step removes **every**
     *   `</div>`, not only the ones closing a level wrapper.
     * - HTML comments whose body contains no `<` (this covers the
     *   `<!-- EDIT... -->` markers of the section-edit machinery).
     * - `class="..."` / `id="..."` attributes on h1-h6 (section-edit anchor
     *   and header-id generation; fine to ignore, the spec output has none).
     *
     * The same rules are applied to the expected and the actual HTML, so
     * the comparison stays symmetric.
     *
     * @param string $html HTML to normalize
     * @return string normalized HTML, trimmed of surrounding whitespace
     * @throws \RuntimeException when a replacement step fails inside PCRE
     */
    private function normalizeHtml(string $html): string
    {
        $block = 'p|div|h[1-6]|hr|ul|ol|li|blockquote|pre|table|thead|tbody|tfoot|tr|th|td';

        // Applied strictly in this order: the level wrappers must be gone
        // before the generic `</div>` rule and the block-tag rules run.
        $rules = [
            // Section wrappers and section-edit comments.
            '#<div class="level[1-6]">\s*#' => '',
            '#\s*</div>\s*#' => '',
            '#<!--[^<]*?-->#' => '',
            // Strip sectionedit/id decoration from headings.
            '#<(h[1-6])(?:\s+(?:class|id)="[^"]*")+\s*>#' => '<$1>',
            // Whitespace before/after an opening block tag (including attributes).
            '#\s*<(' . $block . ')((?:\s[^>]*)?)>\s*#' => '<$1$2>',
            // Whitespace before/after a closing block tag.
            '#\s*</(' . $block . ')>\s*#' => '</$1>',
        ];

        foreach ($rules as $pattern => $replacement) {
            $result = preg_replace($pattern, $replacement, $html);
            if ($result === null) {
                // preg_replace() signals failure with null. Carrying on would
                // collapse the document to '' and could make both sides of
                // the comparison "equal" for the wrong reason.
                throw new \RuntimeException(
                    sprintf('preg_replace failed with PCRE error %d for pattern %s', preg_last_error(), $pattern)
                );
            }
            $html = $result;
        }

        return trim($html);
    }
}
145