xref: /dokuwiki/_test/tests/Parsing/ParserMode/GfmEscapeTest.php (revision 65dd2042806255b56d4cf303530fd396ff38f151)
174031e46SAndreas Gohr<?php
274031e46SAndreas Gohr
374031e46SAndreas Gohrnamespace dokuwiki\test\Parsing\ParserMode;
474031e46SAndreas Gohr
574031e46SAndreas Gohruse dokuwiki\Parsing\ModeRegistry;
674031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmBacktickSingle;
774031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmEmphasis;
874031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmEscape;
974031e46SAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmHeader;
10*65dd2042SAndreas Gohruse dokuwiki\Parsing\ParserMode\Linebreak;
1174031e46SAndreas Gohr
/**
 * Tests for the GFM backslash-escape mode.
 *
 * Each test registers the modes it needs on the parser (`$this->P`), parses a
 * small snippet and then inspects the calls recorded on the handler
 * (`$this->H->calls`). Every recorded call is indexed as `[name, args, ...]`,
 * which is why the helpers below read `$call[0]` (the call name) and
 * `$call[1][0]` (the first argument, i.e. the text payload).
 */
class GfmEscapeTest extends ParserTestBase
{
    /**
     * Switch the configured syntax to pure Markdown and clear the mode
     * registry so every test starts from the same state.
     */
    public function setUp(): void
    {
        parent::setUp();
        global $conf;
        $conf['syntax'] = 'md';
        ModeRegistry::reset();
    }

    /**
     * Clear the mode registry again so state changed by a test (e.g. a
     * different `$conf['syntax']`) does not leak into later tests.
     */
    public function tearDown(): void
    {
        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * Concatenate the text payload of every recorded `cdata` call, in the
     * order the calls were emitted.
     *
     * @return string the joined cdata text; empty when no cdata was emitted
     */
    private function collectCdataText(): string
    {
        $text = '';
        foreach ($this->H->calls as $call) {
            if ($call[0] === 'cdata') {
                $text .= $call[1][0];
            }
        }

        return $text;
    }

    /**
     * List the names of all recorded handler calls, in emission order.
     *
     * @return array the call names (first element of every recorded call)
     */
    private function emittedCallNames(): array
    {
        return array_column($this->H->calls, 0);
    }

    /**
     * Every ASCII punctuation character is escapable per GFM §6.1.
     *
     * @dataProvider provideEscapableChars
     */
    public function testEscapableAsciiPunctuationProducesLiteral(string $char): void
    {
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse('foo \\' . $char . ' bar');

        $this->assertSame(
            "\nfoo " . $char . ' bar',
            $this->collectCdataText(),
            "Escaped {$char} must collapse to the literal char in cdata stream"
        );
    }

    /**
     * One data set per ASCII punctuation character, keyed by the hex value
     * of the character so failing sets are easy to identify.
     *
     * @return array data sets of the form ['char_XX' => [char]]
     */
    public static function provideEscapableChars(): array
    {
        $chars = str_split('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~');
        return array_combine(
            array_map(static fn($c) => 'char_' . bin2hex($c), $chars),
            array_map(static fn($c) => [$c], $chars),
        );
    }

    /**
     * Backslash before non-ASCII-punctuation stays literal — letters,
     * digits, multibyte chars, spaces, and tabs are not escapable. The
     * pattern simply doesn't match, so the bytes flow through as cdata.
     *
     * @dataProvider provideNonEscapableChars
     */
    public function testNonEscapableCharsKeepBackslash(string $tail): void
    {
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse('a \\' . $tail . ' b');

        $this->assertSame("\na \\" . $tail . ' b', $this->collectCdataText());
    }

    /**
     * Representative characters that must NOT be treated as escapable.
     *
     * @return array data sets of the form [label => [char]]
     */
    public static function provideNonEscapableChars(): array
    {
        return [
            'letter_upper' => ['A'],
            'letter_lower' => ['a'],
            'digit'        => ['3'],
            'multibyte'    => ['α'],
            'space'        => [' '],
            'tab'          => ["\t"],
        ];
    }

    /**
     * An escaped backslash is emitted as a single literal backslash.
     */
    public function testDoubleBackslashCollapsesToSingleBackslash(): void
    {
        // \\ is the escaped-backslash form. The first char in the match
        // is consumed as the escape introducer; the second is emitted as
        // a literal backslash.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse('foo \\\\ bar');

        $this->assertSame("\nfoo \\ bar", $this->collectCdataText());
    }

    /**
     * An escaped asterisk must not be usable as an emphasis opener.
     */
    public function testEscapedAsteriskBlocksEmphasis(): void
    {
        // GFM spec example 310 fragment. \* must consume the asterisk
        // before GfmEmphasis can use it as an opener.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_emphasis', new GfmEmphasis());
        $this->P->parse('\\*not emphasized*');

        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'Escaped opener must not start emphasis'
        );
    }

    /**
     * An escaped backslash directly before emphasis leaves the emphasis intact.
     */
    public function testEscapedBackslashThenEmphasisOpens(): void
    {
        // GFM spec example 311. \\ collapses to a literal backslash, and
        // the *emphasis* that follows is now seen by GfmEmphasis with
        // its full text intact.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_emphasis', new GfmEmphasis());
        $this->P->parse('\\\\*emphasis*');

        $this->assertContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'After \\\\ collapses, the surviving *emphasis* must open emphasis'
        );
    }

    /**
     * An escaped hash at the start of a line must not produce a header.
     */
    public function testEscapedHashBlocksHeader(): void
    {
        // \# must defeat GfmHeader's column-0 # match. The trailing text
        // becomes a normal paragraph instead.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_header', new GfmHeader());
        $this->P->parse("\\# not a heading");

        $this->assertNotContains(
            'header',
            $this->emittedCallNames(),
            'Escaped # must not produce a header'
        );
    }

    /**
     * Backslashes inside a single-backtick code span are kept verbatim.
     */
    public function testNoEscapeInsideBacktickSpan(): void
    {
        // GFM spec example 313. The whole span (backtick, \[\, backtick)
        // is captured by GfmBacktickSingle in one regex shot, so GfmEscape
        // never runs on its body. The body must retain the literal
        // backslashes.
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('gfm_backtick_single', new GfmBacktickSingle());
        $this->P->parse('`\\[\\`');

        // Collect the text payload of every `unformatted` call.
        $bodies = [];
        foreach ($this->H->calls as $call) {
            if ($call[0] === 'unformatted') {
                $bodies[] = $call[1][0];
            }
        }

        $this->assertContains(
            '\\[\\',
            $bodies,
            'Backtick span body must preserve the literal backslashes'
        );
    }

    /**
     * The mode reports sort value 5.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEscape();
        $this->assertSame(5, $mode->getSort());
    }

    /**
     * In pure `md` mode, `\\` before a newline still escapes to a literal
     * backslash per GFM §6.1 — no DW Linebreak is loaded to defer to.
     */
    public function testDoubleBackslashBeforeNewlineEscapesInPureMd(): void
    {
        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->parse("foo \\\\\nbar");

        $this->assertNotContains(
            'linebreak',
            $this->emittedCallNames(),
            'No DW Linebreak is loaded in pure md mode — `\\\\\\n` must stay an escape'
        );

        $this->assertSame(
            "\nfoo \\\nbar",
            $this->collectCdataText(),
            '`\\\\` collapses to a literal backslash; the newline survives as cdata'
        );
    }

    /**
     * In any DW-loaded mode (`dw+md` / `md+dw`), `\\` directly before a
     * newline must defer to DW's Linebreak mode. GfmEscape would otherwise
     * consume those two bytes first (sort 5 vs Linebreak's 140) and the
     * forced linebreak would never fire.
     *
     * This test covers the newline case only; the space case has its own
     * test below.
     *
     * @dataProvider provideDwLoadedSyntaxes
     */
    public function testDoubleBackslashBeforeNewlineDefersToLinebreakWhenDwLoaded(string $syntax): void
    {
        global $conf;
        $conf['syntax'] = $syntax;

        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('linebreak', new Linebreak());
        $this->P->parse("foo\\\\\nbar");

        // The message renders the input as two backslashes followed by "\n".
        $this->assertContains(
            'linebreak',
            $this->emittedCallNames(),
            "Under {$syntax}, `\\\\\\n` must yield a DW linebreak instead of an escape"
        );
    }

    /**
     * Same deferral applies for `\\` before a literal space — the
     * canonical DW forced-linebreak form.
     *
     * @dataProvider provideDwLoadedSyntaxes
     */
    public function testDoubleBackslashBeforeSpaceDefersToLinebreakWhenDwLoaded(string $syntax): void
    {
        global $conf;
        $conf['syntax'] = $syntax;

        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('linebreak', new Linebreak());
        $this->P->parse('foo \\\\ bar');

        // The message renders the input as two backslashes followed by a space.
        $this->assertContains(
            'linebreak',
            $this->emittedCallNames(),
            "Under {$syntax}, `\\\\ ` must yield a DW linebreak instead of an escape"
        );
    }

    /**
     * The deferral is narrow: `\\` followed by non-whitespace still
     * escapes to a literal backslash, even with DW Linebreak loaded.
     * UNC-style paths like `\\\\host\\share` would otherwise become a
     * surprise of literal double-backslashes for a user who typed two
     * GFM-escapes back-to-back.
     *
     * @dataProvider provideDwLoadedSyntaxes
     */
    public function testMidLineDoubleBackslashStillEscapesWhenDwLoaded(string $syntax): void
    {
        global $conf;
        $conf['syntax'] = $syntax;

        $this->P->addMode('gfm_escape', new GfmEscape());
        $this->P->addMode('linebreak', new Linebreak());
        $this->P->parse('\\\\\\\\host\\\\share');

        $this->assertNotContains(
            'linebreak',
            $this->emittedCallNames(),
            'Mid-line `\\\\` (no EOL whitespace) must not fire a linebreak'
        );

        $this->assertSame(
            "\n\\\\host\\share",
            $this->collectCdataText(),
            'Each `\\\\` collapses to a single literal backslash, GFM-style'
        );
    }

    /**
     * The `$conf['syntax']` values under which DW syntax is loaded
     * alongside Markdown.
     *
     * @return array data sets of the form [label => [syntax]]
     */
    public static function provideDwLoadedSyntaxes(): array
    {
        return [
            'dw_md' => ['dw+md'],
            'md_dw' => ['md+dw'],
        ];
    }
}
259