xref: /dokuwiki/_test/tests/Parsing/ParserMode/GfmEmphasisUnderscoreTest.php (revision bcefb8ae61f4ff776efdbad9508c8ee8e5c548a6)
1<?php
2
3namespace dokuwiki\test\Parsing\ParserMode;
4
5use dokuwiki\Parsing\ModeRegistry;
6use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;
7
/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Only loaded when Markdown is preferred or the only syntax; see SPEC.md and
 * the registry test coverage in ModeRegistryTest.
 *
 * Follows the FormattingTest pattern: one mode loaded in isolation, assertions
 * against handler instruction sequences. The setUp flips ModeRegistry to
 * `markdown` syntax so the Base mode recognizes `gfm_emphasis_underscore` as
 * an allowed nested mode. The previous `$conf['syntax']` value is restored in
 * tearDown so the override cannot leak into other tests.
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /** Name under which the mode under test is registered with the parser. */
    private const MODE_NAME = 'gfm_emphasis_underscore';

    /**
     * Snapshot of `$conf['syntax']` taken in setUp.
     *
     * `null` means no snapshot was taken; an empty array means the key did not
     * exist; otherwise the original value is stored under the `value` key.
     *
     * @var array|null
     */
    private $syntaxBackup = null;

    /**
     * Switch the global configuration to Markdown syntax and reset the mode
     * registry so it picks up the new setting.
     */
    public function setUp(): void
    {
        parent::setUp();
        global $conf;

        // Remember what was configured before so tearDown can put it back.
        $this->syntaxBackup = (is_array($conf) && array_key_exists('syntax', $conf))
            ? ['value' => $conf['syntax']]
            : [];

        $conf['syntax'] = 'markdown';
        ModeRegistry::reset();
    }

    /**
     * Undo the syntax override made in setUp, then reset the mode registry.
     */
    public function tearDown(): void
    {
        global $conf;

        // Only restore when setUp actually took a snapshot.
        if ($this->syntaxBackup !== null) {
            if (array_key_exists('value', $this->syntaxBackup)) {
                $conf['syntax'] = $this->syntaxBackup['value'];
            } else {
                unset($conf['syntax']);
            }
            $this->syntaxBackup = null;
        }

        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * Register the underscore emphasis mode on the parser, parse the input and
     * return the handler's recorded calls.
     *
     * @param string $input wiki text to parse
     * @return array the handler calls recorded while parsing
     */
    private function parseWithMode(string $input): array
    {
        $this->P->addMode(self::MODE_NAME, new GfmEmphasisUnderscore());
        $this->P->parse($input);

        return $this->H->calls;
    }

    /**
     * Parse the input with the mode loaded and return only the first element
     * (the instruction name) of every recorded handler call.
     *
     * @param string $input wiki text to parse
     * @return array list of instruction names in call order
     */
    private function parseToCallNames(string $input): array
    {
        return array_column($this->parseWithMode($input), 0);
    }

    public function testBasicUnderscore(): void
    {
        $actual = $this->parseWithMode('Foo _Bar_ Baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testSingleCharacter(): void
    {
        $actual = $this->parseWithMode('foo _b_ bar');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testMultipleWords(): void
    {
        $actual = $this->parseWithMode('_one two three_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        // GFM's key word-boundary rule: underscores inside words stay literal.
        $actual = $this->parseWithMode('this_is_not_an_emphasis');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        // A `_` directly followed by whitespace must not open emphasis, so
        // the whole line is expected to stay a single cdata run.
        $actual = $this->parseWithMode('foo _ bar_ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        // `__foo__` must stay literal. At the first `_`, the lookahead
        // `(?=[^\s_])` forbids entry (next char is another `_`). At the
        // second `_`, the lookbehind also fails because `_` itself is not
        // a "non-word" character (it's excluded from NON_WORD_CHAR so that
        // `__foo` can't open emphasis at the inner underscore).
        $actual = $this->parseWithMode('foo __bar__ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $actual = $this->parseWithMode('_one_ and _two_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testMultilineEmphasis(): void
    {
        // Single newlines inside the span are kept verbatim in the cdata.
        $actual = $this->parseWithMode("_line\nline\nline_");
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $actual);
    }

    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    public function testDoesNotSpanParagraphBoundary(): void
    {
        $modes = $this->parseToCallNames("_open\n\nclose_");
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $modes = $this->parseToCallNames("_open\nclose_");
        $this->assertContains(
            'emphasis_open',
            $modes,
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     * This test is derived from CommonMark spec §6.2 example 418:
     *
     *     пристаням_стремятся_
     *
     * which must render as literal (no emphasis). The surrounding Cyrillic
     * letters are word-like; the underscores are intraword and must not
     * emphasize.
     *
     * The word-boundary constants (NO_WORD_BEFORE / NO_WORD_AFTER) are
     * defined positively (matching explicit non-word chars) rather than
     * negatively (not matching a-zA-Z0-9), so multibyte UTF-8 bytes — which
     * are not in any ASCII class — are correctly treated as word-like.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input text whose underscores must all stay literal
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $modes = $this->parseToCallNames($input);
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            "Intraword `_` in multibyte text must not emphasize: " . json_encode($input)
        );
    }

    /**
     * Inputs for testIntrawordUnderscoreInMultibyteText.
     *
     * @return array named data sets, each holding a single input string
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec §6.2 ex. 418 — Cyrillic intraword
            'cyrillic-trailing'  => ['пристаням_стремятся_'],
            // CommonMark spec §6.2 ex. 420 — Cyrillic leading
            'cyrillic-leading'   => ['_пристаням_стремятся'],
            // German umlaut — no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut'      => ['für_etwas_text'],
            // CJK — same expectation
            'cjk-intraword'      => ['日本_語_の'],
            // Greek
            'greek-intraword'    => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * Underscores flanked by multibyte letters on both sides are intraword:
     * the first `_` has a Cyrillic letter before it and the last `_` has a
     * Cyrillic letter after it, so no emphasis may be opened anywhere in the
     * input. Guards against multibyte word characters being mistaken for a
     * word boundary on either side of the delimiter.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $modes = $this->parseToCallNames('до_середины_текста');
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive: when the surrounding non-word context is whitespace or
     * punctuation, multibyte content *inside* the emphasis span is fine.
     * `_für etwas_` surrounded by spaces should emphasize the multibyte text.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $modes = $this->parseToCallNames('foo _für etwas_ bar');
        $this->assertContains(
            'emphasis_open',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}
279