xref: /dokuwiki/_test/tests/Parsing/Markdown/GfmSpecTest.php (revision aa346d4b22f9f1b8a72dc049417af675e416fdb3)
172b2703bSAndreas Gohr<?php
272b2703bSAndreas Gohr
372b2703bSAndreas Gohrnamespace dokuwiki\test\Parsing\Markdown;
472b2703bSAndreas Gohr
572b2703bSAndreas Gohruse dokuwiki\Parsing\ModeRegistry;
672b2703bSAndreas Gohr
/**
 * Roundtrip tests driven by GFM's spec.txt.
 *
 * Each example in gfm-spec/spec.txt becomes one data-provider case. The
 * markdown input is run through DokuWiki's full pipeline (parser + XHTML
 * renderer) and the result is compared to the expected HTML from the spec,
 * tolerating whitespace differences around block-level tags.
 *
 * Most examples are expected to FAIL until the relevant GFM parser modes
 * are implemented — they are the branch's living TODO list for GFM parity.
 * Do not mark such failures incomplete or skipped.
 *
 * `gfm-spec/skip.php` lists examples that are deliberately out of scope
 * for DokuWiki (e.g. CommonMark flanking-delimiter edge cases). Those are
 * reported as skipped with a reason.
 */
class GfmSpecTest extends \DokuWikiTest
{
    /** Directory containing the spec corpus (spec.txt) and the skip list (skip.php). */
    private const FIXTURE_DIR = __DIR__ . '/gfm-spec/';

    /**
     * strtr() translation table restoring literal tabs.
     *
     * Spec convention (spec.txt §"About this document"): the `→` character
     * (U+2192) in examples represents a literal tab. The tab is restored in
     * both input and expected output so the corpus exercises real
     * tab-handling behavior, not arrow-character handling.
     */
    private const TAB_MARKER = ["\u{2192}" => "\t"];

    /**
     * Data provider: one case per example found in spec.txt.
     *
     * Each case is keyed by "#<number> <section>" and carries the markdown
     * input, the expected HTML, and the skip reason looked up in skip.php
     * by example number (null when the example is not listed there).
     *
     * @return iterable<string, array{0: string, 1: string, 2: ?string}>
     */
    public static function specProvider(): iterable
    {
        $reader = new SpecReader(self::FIXTURE_DIR . 'spec.txt');
        $skip   = require self::FIXTURE_DIR . 'skip.php';

        foreach ($reader->examples() as $ex) {
            $reason = $skip[$ex['number']] ?? null;
            $label  = sprintf('#%d %s', $ex['number'], $ex['section']);
            $md     = strtr($ex['markdown'], self::TAB_MARKER);
            $html   = strtr($ex['html'], self::TAB_MARKER);
            yield $label => [$md, $html, $reason];
        }
    }

    /**
     * Render one spec example and compare it against the reference HTML.
     *
     * @dataProvider specProvider
     *
     * @param string      $md         markdown input of the example
     * @param string      $expected   reference HTML from the spec
     * @param string|null $skipReason non-null when the example is listed in skip.php
     */
    public function testExample(string $md, string $expected, ?string $skipReason): void
    {
        if ($skipReason !== null) {
            $this->markTestSkipped($skipReason);
        }
        $actual = $this->renderMarkdown($md);
        $this->assertHtmlEquals($expected, $actual);
    }

    /**
     * Reset the mode registry after every case, then run the parent teardown.
     */
    public function tearDown(): void
    {
        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * Render markdown text through DokuWiki's full parser pipeline under
     * the `md` syntax setting, using {@see SpecCompatRenderer} —
     * an XHTML renderer subclass that emits the minimal link/media HTML
     * shape the GFM spec expects. Production rendering is unchanged;
     * this override exists so spec output can be compared byte-for-byte.
     *
     * Typography is forced off for the spec run: $conf[typography] = 0
     * keeps the Quotes and MultiplyEntity modes (curly quote pairing,
     * apostrophe to numeric entity) out of the mode list. Both are
     * correct for production wiki prose but diverge byte-for-byte from
     * spec output. SpecCompatRenderer additionally neutralizes the
     * Entity-table substitutions (--, ---, ->, (c), ...) at render time;
     * see SpecCompatRenderer::entity().
     *
     * The renderer's acronym table is left empty so the parser-emitted
     * acronym() calls (e.g. for "FTP") fall through to literal text
     * instead of wrapping in <abbr>, which the spec output never has.
     *
     * @param string $text markdown source
     * @return string the renderer's accumulated output document
     */
    private function renderMarkdown(string $text): string
    {
        global $conf;
        $conf['syntax'] = 'md';
        $conf['typography'] = 0;
        // Reset the registry after changing the config and before parsing.
        ModeRegistry::reset();

        $instructions = p_get_instructions($text);

        $renderer = new SpecCompatRenderer();
        $renderer->reset();
        $renderer->smileys   = getSmileys();
        $renderer->entities  = getEntities();
        $renderer->acronyms  = [];
        $renderer->interwiki = getInterwiki();

        // Replay the instruction list on the renderer; instructions the
        // renderer has no method for are ignored.
        foreach ($instructions as $instruction) {
            if (method_exists($renderer, $instruction[0])) {
                call_user_func_array([$renderer, $instruction[0]], $instruction[1] ?: []);
            }
        }
        return $renderer->doc;
    }

    /**
     * Assert two HTML strings are equivalent after whitespace normalization.
     *
     * DokuWiki's XHTML renderer emits extra whitespace around block tags
     * that the spec's reference HTML omits. The comparator strips whitespace
     * only around **block-level** tags (p, div, h1-h6, ul/ol/li, table/tr/td,
     * blockquote, pre, hr). Whitespace around **inline** tags (em, strong,
     * a, code, span, img, br, etc.) is preserved, because `<em>x</em> y`
     * and `<em>x</em>y` render differently.
     *
     * @param string $expected reference HTML from the spec
     * @param string $actual   HTML produced by the renderer
     */
    private function assertHtmlEquals(string $expected, string $actual): void
    {
        // Both operands are strings; assertSame states the strict,
        // type-safe comparison that is intended here.
        $this->assertSame(
            $this->normalizeHtml($expected),
            $this->normalizeHtml($actual)
        );
    }

    /**
     * Strip whitespace adjacent to block-level tags; leave inline tags alone.
     *
     * Additionally drops DokuWiki-specific heading decoration that carries no
     * semantic meaning for GFM-conformance checks:
     *
     * - `<div class="levelN">` / matching `</div>` section wrappers the
     *   renderer emits after every header call.
     * - `class="..."` / `id="..."` attributes on h1-h6 (section-edit anchor
     *   and header-id generation; fine to ignore, the spec output has none).
     *
     * The replacements are applied in the order listed; each one operates
     * on the result of the previous one.
     *
     * @param string $html HTML to normalize
     * @return string normalized HTML with leading/trailing whitespace trimmed
     * @throws \RuntimeException when a replacement fails inside PCRE
     */
    private function normalizeHtml(string $html): string
    {
        $block = 'p|div|h[1-6]|hr|ul|ol|li|blockquote|pre|table|thead|tbody|tfoot|tr|th|td';

        $replacements = [
            // Drop DokuWiki's `<div class="levelN">` section wrappers and the
            // HTML comments (`<!-- EDIT... -->`) its section-edit machinery
            // inserts after each heading. Neither is semantically part of the
            // heading and GFM reference output never contains them.
            // Note: the closing-tag pattern removes every `</div>`, not only
            // those closing a level wrapper. Both sides of the comparison go
            // through the same normalization, so it stays symmetric.
            '#<div class="level[1-6]">\s*#' => '',
            '#\s*</div>\s*#'                => '',
            '#<!--[^<]*?-->#'               => '',

            // Strip sectionedit/id decoration from headings.
            '#<(h[1-6])(?:\s+(?:class|id)="[^"]*")+\s*>#' => '<$1>',

            // Whitespace before/after an opening block tag (including attributes)
            '#\s*<(' . $block . ')((?:\s[^>]*)?)>\s*#' => '<$1$2>',
            // Whitespace before/after a closing block tag
            '#\s*</(' . $block . ')>\s*#' => '</$1>',
        ];

        foreach ($replacements as $pattern => $replacement) {
            $html = $this->replaceOrFail($pattern, $replacement, $html);
        }

        return trim($html);
    }

    /**
     * preg_replace() wrapper that refuses to continue after a PCRE error.
     *
     * preg_replace() returns null on failure. Passing that along would
     * collapse the normalized HTML to an empty string, and two collapsed
     * sides would compare equal — a false pass. Throwing makes the test
     * error out instead.
     *
     * @param string $pattern     PCRE pattern including delimiters
     * @param string $replacement replacement string
     * @param string $subject     input text
     * @return string the subject with all matches replaced
     * @throws \RuntimeException when preg_replace() reports an error
     */
    private function replaceOrFail(string $pattern, string $replacement, string $subject): string
    {
        $result = preg_replace($pattern, $replacement, $subject);
        if ($result === null) {
            throw new \RuntimeException(
                sprintf('preg_replace failed for pattern %s (PCRE error code %d)', $pattern, preg_last_error())
            );
        }
        return $result;
    }
}
158