xref: /dokuwiki/_test/tests/Parsing/ParserMode/GfmEscapeTest.php (revision 47a02a102092be9e1e6f1ddaf158bdfffdb13d4f)
174031e46SAndreas Gohr<?php
274031e46SAndreas Gohr
374031e46SAndreas Gohrnamespace dokuwiki\test\Parsing\ParserMode;
474031e46SAndreas Gohr
574031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmBacktickSingle;
674031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmEmphasis;
774031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmEscape;
874031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmHeader;
965dd2042SAndreas Gohruse dokuwiki\Parsing\ParserMode\Linebreak;
1074031e46SAndreas Gohr
/**
 * Tests for the GFM backslash-escape mode.
 *
 * Each test registers the parser modes it needs on `$this->P`, parses a
 * small snippet and then inspects the instruction list recorded in
 * `$this->H->calls`.
 */
class GfmEscapeTest extends ParserTestBase
{
    /**
     * Start every test in pure `md` syntax; tests that need a DW-loaded
     * syntax switch it themselves via setSyntax().
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->setSyntax('md');
    }

    /**
     * Concatenate the first argument of every recorded `cdata` call, in
     * the order the calls were recorded.
     *
     * @return string the joined cdata text
     */
    private function collectCdataText(): string
    {
        $cdataCalls = array_filter($this->H->calls, static fn($c) => $c[0] === 'cdata');
        return implode('', array_map(static fn($c) => $c[1][0], $cdataCalls));
    }

    /**
     * List the name (element 0) of every recorded call.
     *
     * @return array the call names in recording order
     */
    private function recordedCallNames(): array
    {
        return array_column($this->H->calls, 0);
    }

    /**
     * Every ASCII punctuation character is escapable per GFM §6.1.
     *
     * @dataProvider provideEscapableChars
     * @param string $char the single punctuation character placed after the backslash
     */
    public function testEscapableAsciiPunctuationProducesLiteral(string $char): void
    {
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse('foo \\' . $char . ' bar');

        $this->assertSame("\nfoo " . $char . ' bar', $this->collectCdataText(),
            "Escaped {$char} must collapse to the literal char in cdata stream");
    }

    /**
     * One named dataset per ASCII punctuation character; the dataset key
     * is `char_` followed by the character's hex code.
     *
     * @return array<string, array{string}>
     */
    public static function provideEscapableChars(): array
    {
        $chars = str_split('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~');
        return array_combine(
            array_map(static fn($c) => 'char_' . bin2hex($c), $chars),
            array_map(static fn($c) => [$c], $chars),
        );
    }

    /**
     * Backslash before non-ASCII-punctuation stays literal — letters,
     * digits, multibyte chars, spaces, and tabs are not escapable. The
     * pattern simply doesn't match, so the bytes flow through as cdata.
     *
     * @dataProvider provideNonEscapableChars
     * @param string $tail the character placed directly after the backslash
     */
    public function testNonEscapableCharsKeepBackslash(string $tail): void
    {
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse('a \\' . $tail . ' b');

        $this->assertSame("\na \\" . $tail . ' b', $this->collectCdataText());
    }

    /**
     * Characters that must not be treated as escapable.
     *
     * @return array<string, array{string}>
     */
    public static function provideNonEscapableChars(): array
    {
        return [
            'letter_upper' => ['A'],
            'letter_lower' => ['a'],
            'digit'        => ['3'],
            'multibyte'    => ['α'],
            'space'        => [' '],
            'tab'          => ["\t"],
        ];
    }

    public function testDoubleBackslashCollapsesToSingleBackslash(): void
    {
        // \\ is the escaped-backslash form. The first char in the match
        // is consumed as the escape introducer; the second is emitted as
        // a literal backslash.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse('foo \\\\ bar');

        $this->assertSame("\nfoo \\ bar", $this->collectCdataText());
    }

    public function testEscapedAsteriskBlocksEmphasis(): void
    {
        // GFM spec example 310 fragment. \* must consume the asterisk
        // before GfmEmphasis can use it as an opener.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_emphasis', new GfmEmphasis());
        $this->P->parse('\\*not emphasized*');

        $this->assertNotContains('emphasis_open', $this->recordedCallNames(),
            'Escaped opener must not start emphasis');
    }

    public function testEscapedBackslashThenEmphasisOpens(): void
    {
        // GFM spec example 311. \\ collapses to a literal backslash, and
        // the *emphasis* that follows is now seen by GfmEmphasis with
        // its full text intact.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_emphasis', new GfmEmphasis());
        $this->P->parse('\\\\*emphasis*');

        $this->assertContains('emphasis_open', $this->recordedCallNames(),
            'After \\\\ collapses, the surviving *emphasis* must open emphasis');
    }

    public function testEscapedHashBlocksHeader(): void
    {
        // \# must defeat GfmHeader's column-0 # match. The trailing text
        // becomes a normal paragraph instead.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_header', new GfmHeader());
        $this->P->parse("\\# not a heading");

        $this->assertNotContains('header', $this->recordedCallNames(),
            'Escaped # must not produce a header');
    }

    public function testNoEscapeInsideBacktickSpan(): void
    {
        // GFM spec example 313. The whole `\[\`` is captured by
        // GfmBacktickSingle in one regex shot, so GfmEscape never runs
        // on its body. The body must retain the literal backslashes.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_backtick_single', new GfmBacktickSingle());
        $this->P->parse('`\\[\\`');

        $unformatted = array_filter($this->H->calls, static fn($c) => $c[0] === 'unformatted');
        $bodies = array_map(static fn($c) => $c[1][0], $unformatted);
        $this->assertContains('\\[\\', $bodies,
            'Backtick span body must preserve the literal backslashes');
    }

    public function testSortValue(): void
    {
        $mode = new GfmEscape();
        $this->assertSame(5, $mode->getSort());
    }

    /**
     * In pure `md` mode, `\\` before a newline still escapes to a literal
     * backslash per GFM §6.1 — no DW Linebreak is loaded to defer to.
     */
    public function testDoubleBackslashBeforeNewlineEscapesInPureMd(): void
    {
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse("foo \\\\\nbar");

        $this->assertNotContains('linebreak', $this->recordedCallNames(),
            'No DW Linebreak is loaded in pure md mode — `\\\\\\n` must stay an escape');

        $this->assertSame("\nfoo \\\nbar", $this->collectCdataText(),
            '`\\\\` collapses to a literal backslash; the newline survives as cdata');
    }

    /**
     * In any DW-loaded mode (`dw+md` / `md+dw`), `\\` before a space, tab,
     * or newline must defer to DW's Linebreak mode. GfmEscape would
     * otherwise consume those two bytes first (sort 5 vs Linebreak's 140)
     * and the forced linebreak would never fire.
     *
     * @dataProvider provideDwLoadedSyntaxes
     * @param string $syntax the syntax identifier handed to setSyntax()
     */
    public function testDoubleBackslashBeforeNewlineDefersToLinebreakWhenDwLoaded(string $syntax): void
    {
        $this->setSyntax($syntax);

        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('linebreak', new Linebreak());
        $this->P->parse("foo\\\\\nbar");

        // The message renders as `\\\n`, i.e. exactly the two backslashes
        // plus newline that were parsed above.
        $this->assertContains('linebreak', $this->recordedCallNames(),
            "Under {$syntax}, `\\\\\\n` must yield a DW linebreak instead of an escape");
    }

    /**
     * Same deferral applies for `\\` before a literal space — the
     * canonical DW forced-linebreak form.
     *
     * @dataProvider provideDwLoadedSyntaxes
     * @param string $syntax the syntax identifier handed to setSyntax()
     */
    public function testDoubleBackslashBeforeSpaceDefersToLinebreakWhenDwLoaded(string $syntax): void
    {
        $this->setSyntax($syntax);

        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('linebreak', new Linebreak());
        $this->P->parse('foo \\\\ bar');

        // The message renders as `\\ `, matching the two backslashes plus
        // space that were parsed above.
        $this->assertContains('linebreak', $this->recordedCallNames(),
            "Under {$syntax}, `\\\\ ` must yield a DW linebreak instead of an escape");
    }

    /**
     * The deferral is narrow: `\\` followed by non-whitespace still
     * escapes to a literal backslash, even with DW Linebreak loaded.
     * UNC-style paths like `\\\\host\\share` would otherwise become a
     * surprise of literal double-backslashes for a user who typed two
     * GFM-escapes back-to-back.
     *
     * @dataProvider provideDwLoadedSyntaxes
     * @param string $syntax the syntax identifier handed to setSyntax()
     */
    public function testMidLineDoubleBackslashStillEscapesWhenDwLoaded(string $syntax): void
    {
        $this->setSyntax($syntax);

        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('linebreak', new Linebreak());
        $this->P->parse('\\\\\\\\host\\\\share');

        $this->assertNotContains('linebreak', $this->recordedCallNames(),
            'Mid-line `\\\\` (no EOL whitespace) must not fire a linebreak');

        $this->assertSame("\n\\\\host\\share", $this->collectCdataText(),
            'Each `\\\\` collapses to a single literal backslash, GFM-style');
    }

    /**
     * The syntax settings under which the DW-deferral tests are run.
     *
     * @return array<string, array{string}>
     */
    public static function provideDwLoadedSyntaxes(): array
    {
        return [
            'dw_md' => ['dw+md'],
            'md_dw' => ['md+dw'],
        ];
    }
}
247