xref: /dokuwiki/_test/tests/Parsing/ParserMode/GfmEmphasisUnderscoreTest.php (revision b73ece99c18919754d993a1d1f5cb27140555705)
1<?php
2
3namespace dokuwiki\test\Parsing\ParserMode;
4
5use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;
6
/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Each parsing test registers a fresh GfmEmphasisUnderscore mode on the
 * inherited parser (`$this->P`), parses a snippet and then inspects the
 * calls recorded by the inherited handler (`$this->H->calls`).
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Run the parent fixture setup, then select the `md` syntax for every test.
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->setSyntax('md');
    }

    /**
     * Register the mode under test on the parser and parse the given text.
     *
     * @param string $text raw text handed to the parser
     */
    private function parseWithUnderscoreMode(string $text): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($text);
    }

    /**
     * Collect the name (first element) of every call the handler has recorded.
     *
     * @return array list of call names in recorded order
     */
    private function recordedCallNames(): array
    {
        return array_column($this->H->calls, 0);
    }

    /**
     * A single word wrapped in underscores between two other words is emphasised.
     */
    public function testBasicUnderscore(): void
    {
        $this->parseWithUnderscoreMode('Foo _Bar_ Baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A one-character span (`_b_`) is emphasised.
     */
    public function testSingleCharacter(): void
    {
        $this->parseWithUnderscoreMode('foo _b_ bar');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A span containing spaces is emphasised as a whole; the input consists
     * of nothing but the span, so an empty cdata call follows the close.
     */
    public function testMultipleWords(): void
    {
        $this->parseWithUnderscoreMode('_one two three_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * GFM's key word-boundary rule: underscores inside words stay literal.
     */
    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        $this->parseWithUnderscoreMode('this_is_not_an_emphasis');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * An opening `_` that is immediately followed by a space must not start emphasis.
     */
    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo _ bar_ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * `__bar__` must stay literal.
     *
     * Rationale as stated for the mode's pattern (the pattern itself is not
     * visible in this file): at the first `_`, the lookahead `(?=[^\s_])`
     * forbids entry (next char is another `_`). At the second `_`, the
     * lookbehind also fails because `_` itself is not a "non-word" character
     * (it's excluded from NON_WORD_CHAR so that `__foo` can't open emphasis
     * at the inner underscore).
     */
    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo __bar__ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Two independent spans on one line are each emphasised separately.
     */
    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $this->parseWithUnderscoreMode('_one_ and _two_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A span may contain single newlines; they are kept inside the emphasised cdata.
     */
    public function testMultilineEmphasis(): void
    {
        $this->parseWithUnderscoreMode("_line\nline\nline_");
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The mode reports 80 as its sort value.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    /**
     * A blank line between the opening and the closing `_` prevents emphasis.
     */
    public function testDoesNotSpanParagraphBoundary(): void
    {
        $this->parseWithUnderscoreMode("_open\n\nclose_");
        $this->assertNotContains(
            'emphasis_open',
            $this->recordedCallNames(),
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    /**
     * Counterpart to the paragraph-boundary test: one newline alone does not block emphasis.
     */
    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $this->parseWithUnderscoreMode("_open\nclose_");
        $this->assertContains(
            'emphasis_open',
            $this->recordedCallNames(),
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     *
     * The first data set is derived from CommonMark spec §6.2 example 418:
     *
     *     пристаням_стремятся_
     *
     * which must render as literal (no emphasis). The surrounding Cyrillic
     * letters are word-like; the underscores are intraword and must not
     * emphasize. The remaining data sets apply the same expectation to
     * other scripts.
     *
     * Background as stated for the mode (its constants are not visible in
     * this file): the word-boundary constants (NO_WORD_BEFORE /
     * NO_WORD_AFTER) are defined positively (matching explicit non-word
     * chars) rather than negatively (not matching a-zA-Z0-9), so multibyte
     * UTF-8 bytes — which are not in any ASCII class — are correctly
     * treated as word-like.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input text whose underscores must all stay literal
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $this->parseWithUnderscoreMode($input);
        $this->assertNotContains(
            'emphasis_open',
            $this->recordedCallNames(),
            "Intraword `_` in multibyte text must not emphasize: " . json_encode($input)
        );
    }

    /**
     * Inputs for testIntrawordUnderscoreInMultibyteText().
     *
     * @return array named data sets, each holding a single input string
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec §6.2 ex. 418 — Cyrillic intraword
            'cyrillic-trailing'  => ['пристаням_стремятся_'],
            // CommonMark spec §6.2 ex. 420 — Cyrillic leading
            'cyrillic-leading'   => ['_пристаням_стремятся'],
            // German umlaut — no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut'      => ['für_etwas_text'],
            // CJK — same expectation
            'cjk-intraword'      => ['日本_語_の'],
            // Greek
            'greek-intraword'    => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * An underscore-delimited span embedded between multibyte letters must
     * stay literal: the `_` that would open it is preceded by a multibyte
     * letter (intraword), and the `_` that would close it is followed by
     * one. Verifies that both the lookbehind and the closing-delimiter
     * lookahead reject multibyte word chars.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $this->parseWithUnderscoreMode('до_середины_текста');
        $this->assertNotContains(
            'emphasis_open',
            $this->recordedCallNames(),
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive: when the surrounding non-word context is whitespace or
     * punctuation, multibyte content *inside* the emphasis span is fine.
     * `_für etwas_` surrounded by spaces should emphasize the multibyte text.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $this->parseWithUnderscoreMode('foo _für etwas_ bar');
        $names = $this->recordedCallNames();
        $this->assertContains(
            'emphasis_open',
            $names,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $names,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}
262