xref: /dokuwiki/_test/tests/Parsing/ParserMode/GfmEmphasisUnderscoreTest.php (revision 864d6c6daac408fa4fc25801adcaf0f744288b30)
1<?php
2
3namespace dokuwiki\test\Parsing\ParserMode;
4
5use dokuwiki\Parsing\ModeRegistry;
6use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;
7
/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Each parsing test registers a fresh GfmEmphasisUnderscore instance on the
 * parser held in `$this->P`, parses a short snippet and then inspects the
 * calls recorded in `$this->H->calls`.
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Select the markdown syntax and reset the mode registry before each test.
     */
    public function setUp(): void
    {
        parent::setUp();
        global $conf;
        $conf['syntax'] = 'markdown';
        // NOTE(review): presumably makes the registry pick up the markdown
        // setting chosen above — confirm against ModeRegistry.
        ModeRegistry::reset();
    }

    /**
     * Reset the mode registry again so later tests start from a clean state.
     */
    public function tearDown(): void
    {
        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * Register the mode under test on the parser and parse the given text.
     *
     * @param string $input raw wiki text to parse
     */
    private function parseWithUnderscoreMode(string $input): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($input);
    }

    /**
     * Parse the given text and return only the name (first element) of every
     * recorded call, in order.
     *
     * @param string $input raw wiki text to parse
     * @return array list of call names such as 'p_open' or 'emphasis_open'
     */
    private function callNamesAfterParsing(string $input): array
    {
        $this->parseWithUnderscoreMode($input);
        return array_column($this->H->calls, 0);
    }

    /**
     * A single word wrapped in underscores, surrounded by spaces, is emphasised.
     */
    public function testBasicUnderscore(): void
    {
        $this->parseWithUnderscoreMode('Foo _Bar_ Baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The emphasised span may be as short as one character.
     */
    public function testSingleCharacter(): void
    {
        $this->parseWithUnderscoreMode('foo _b_ bar');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Spaces inside the span are allowed; the span may cover the whole input.
     */
    public function testMultipleWords(): void
    {
        $this->parseWithUnderscoreMode('_one two three_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * GFM's key word-boundary rule: underscores inside words stay literal.
     */
    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        $this->parseWithUnderscoreMode('this_is_not_an_emphasis');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * An opening `_` that is immediately followed by a space is literal text.
     */
    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo _ bar_ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * `__foo__` must stay literal.
     *
     * At the first `_`, the lookahead `(?=[^\s_])` forbids entry (next char
     * is another `_`). At the second `_`, the lookbehind also fails because
     * `_` itself is not a "non-word" character (it's excluded from
     * NON_WORD_CHAR so that `__foo` can't open emphasis at the inner
     * underscore).
     */
    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo __bar__ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Two independent spans on the same line are both emphasised.
     */
    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $this->parseWithUnderscoreMode('_one_ and _two_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A span may run across single line breaks; the newlines are kept in the
     * emphasised cdata.
     */
    public function testMultilineEmphasis(): void
    {
        $this->parseWithUnderscoreMode("_line\nline\nline_");
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The mode reports a sort value of 80.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    /**
     * A blank line between the opening and the closing `_` prevents emphasis.
     */
    public function testDoesNotSpanParagraphBoundary(): void
    {
        $modes = $this->callNamesAfterParsing("_open\n\nclose_");
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    /**
     * Counterpart to the paragraph-boundary test: one newline is still fine.
     */
    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $modes = $this->callNamesAfterParsing("_open\nclose_");
        $this->assertContains(
            'emphasis_open',
            $modes,
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     * This test is derived from CommonMark spec §6.2 example 418:
     *
     *     пристаням_стремятся_
     *
     * which must render as literal (no emphasis). The surrounding Cyrillic
     * letters are word-like; the underscores are intraword and must not
     * emphasize.
     *
     * The word-boundary constants (NO_WORD_BEFORE / NO_WORD_AFTER) are
     * defined positively (matching explicit non-word chars) rather than
     * negatively (not matching a-zA-Z0-9), so multibyte UTF-8 bytes — which
     * are not in any ASCII class — are correctly treated as word-like.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input text whose underscores must all stay literal
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $modes = $this->callNamesAfterParsing($input);
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            "Intraword `_` in multibyte text must not emphasize: " . json_encode($input)
        );
    }

    /**
     * Inputs for testIntrawordUnderscoreInMultibyteText, keyed by a short
     * label; each data set holds the single string to parse.
     *
     * @return array
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec §6.2 ex. 418 — Cyrillic intraword
            'cyrillic-trailing'  => ['пристаням_стремятся_'],
            // CommonMark spec §6.2 ex. 420 — Cyrillic leading
            'cyrillic-leading'   => ['_пристаням_стремятся'],
            // German umlaut — no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut'      => ['für_etwas_text'],
            // CJK — same expectation
            'cjk-intraword'      => ['日本_語_の'],
            // Greek
            'greek-intraword'    => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * An underscore-delimited run that sits between Cyrillic letters on both
     * sides (`до_середины_текста`) must stay literal: the first `_` directly
     * follows a letter and the second `_` is directly followed by one, so
     * neither end is at a word boundary.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $modes = $this->callNamesAfterParsing('до_середины_текста');
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive: when the surrounding non-word context is whitespace or
     * punctuation, multibyte content *inside* the emphasis span is fine.
     * `_für etwas_` surrounded by spaces should emphasize the multibyte text.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $modes = $this->callNamesAfterParsing('foo _für etwas_ bar');
        $this->assertContains(
            'emphasis_open',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}
271