xref: /dokuwiki/_test/tests/Parsing/ParserMode/GfmEmphasisUnderscoreTest.php (revision 47a02a102092be9e1e6f1ddaf158bdfffdb13d4f)
1bcefb8aeSAndreas Gohr<?php
2bcefb8aeSAndreas Gohr
3bcefb8aeSAndreas Gohrnamespace dokuwiki\test\Parsing\ParserMode;
4bcefb8aeSAndreas Gohr
5bcefb8aeSAndreas Gohruse dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;
6bcefb8aeSAndreas Gohr
/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Each parsing test registers a new GfmEmphasisUnderscore instance under the
 * mode name `gfm_emphasis_underscore`, parses a snippet and then inspects the
 * calls recorded in `$this->H->calls`.
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Runs the parent fixture setup and selects the `md` syntax.
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->setSyntax('md');
    }

    /**
     * Register the mode under test on the parser and parse the given text.
     *
     * @param string $text wiki text to feed to the parser
     */
    private function parseWithUnderscoreMode(string $text): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($text);
    }

    /**
     * Names (first element) of all calls recorded by the handler so far.
     *
     * @return array list of call names in recording order
     */
    private function recordedCallNames(): array
    {
        return array_column($this->H->calls, 0);
    }

    /**
     * `_Bar_` between two plain words is wrapped in emphasis_open/close.
     */
    public function testBasicUnderscore(): void
    {
        $this->parseWithUnderscoreMode('Foo _Bar_ Baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A span containing a single character (`_b_`) is still emphasised.
     */
    public function testSingleCharacter(): void
    {
        $this->parseWithUnderscoreMode('foo _b_ bar');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A span may contain several space-separated words; here it also
     * covers the whole input, so an empty cdata follows the close.
     */
    public function testMultipleWords(): void
    {
        $this->parseWithUnderscoreMode('_one two three_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Underscores inside a word produce no emphasis calls at all.
     */
    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        // GFM's key word-boundary rule: underscores inside words stay literal.
        $this->parseWithUnderscoreMode('this_is_not_an_emphasis');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A `_` immediately followed by a space does not open emphasis.
     */
    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo _ bar_ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Doubled underscores (`__bar__`) are left as literal text.
     */
    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        // `__foo__` must stay literal. At the first `_`, the lookahead
        // `(?=[^\s_])` forbids entry (next char is another `_`). At the
        // second `_`, the lookbehind also fails because `_` itself is not
        // a "non-word" character (it's excluded from NON_WORD_CHAR so that
        // `__foo` can't open emphasis at the inner underscore).
        $this->parseWithUnderscoreMode('foo __bar__ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Two independent spans on one line each get their own open/close pair.
     */
    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $this->parseWithUnderscoreMode('_one_ and _two_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A span may run across single newlines; the newlines stay in the cdata.
     */
    public function testMultilineEmphasis(): void
    {
        $this->parseWithUnderscoreMode("_line\nline\nline_");
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The mode reports a sort value of exactly 80.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    /**
     * A blank line between the opening and closing `_` prevents emphasis.
     */
    public function testDoesNotSpanParagraphBoundary(): void
    {
        $this->parseWithUnderscoreMode("_open\n\nclose_");
        $this->assertNotContains(
            'emphasis_open',
            $this->recordedCallNames(),
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    /**
     * Counterpart to the paragraph-boundary test: one newline is allowed.
     */
    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $this->parseWithUnderscoreMode("_open\nclose_");
        $this->assertContains(
            'emphasis_open',
            $this->recordedCallNames(),
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     * This test is derived from CommonMark spec §6.2 example 418:
     *
     *     пристаням_стремятся_
     *
     * which must render as literal (no emphasis). The surrounding Cyrillic
     * letters are word-like; the underscores are intraword and must not
     * emphasize.
     *
     * The word-boundary constants (NO_WORD_BEFORE / NO_WORD_AFTER) are
     * defined positively (matching explicit non-word chars) rather than
     * negatively (not matching a-zA-Z0-9), so multibyte UTF-8 bytes — which
     * are not in any ASCII class — are correctly treated as word-like.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input text that must be parsed without any emphasis_open call
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $this->parseWithUnderscoreMode($input);
        $this->assertNotContains(
            'emphasis_open',
            $this->recordedCallNames(),
            // JSON_UNESCAPED_UNICODE keeps the multibyte input readable in the
            // failure message instead of turning it into \uXXXX escapes.
            "Intraword `_` in multibyte text must not emphasize: "
                . json_encode($input, JSON_UNESCAPED_UNICODE)
        );
    }

    /**
     * Inputs for testIntrawordUnderscoreInMultibyteText().
     *
     * @return array<string, array{0: string}> data set name => [input text]
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec §6.2 ex. 418 — Cyrillic intraword
            'cyrillic-trailing'  => ['пристаням_стремятся_'],
            // CommonMark spec §6.2 ex. 420 — Cyrillic leading
            'cyrillic-leading'   => ['_пристаням_стремятся'],
            // German umlaut — no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut'      => ['für_etwas_text'],
            // CJK — same expectation
            'cjk-intraword'      => ['日本_語_の'],
            // Greek
            'greek-intraword'    => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * Multibyte letters must count as word characters on BOTH sides of an
     * underscore. In `до_середины_текста` each `_` has Cyrillic letters
     * directly before and after it, so neither underscore may open
     * emphasis.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $this->parseWithUnderscoreMode('до_середины_текста');
        $this->assertNotContains(
            'emphasis_open',
            $this->recordedCallNames(),
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive: when the surrounding non-word context is whitespace or
     * punctuation, multibyte content *inside* the emphasis span is fine.
     * `_für etwas_` surrounded by spaces should emphasize the multibyte text.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $this->parseWithUnderscoreMode('foo _für etwas_ bar');
        $modes = $this->recordedCallNames();
        $this->assertContains(
            'emphasis_open',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}
262