<?php

namespace dokuwiki\test\Parsing\ParserMode;

use dokuwiki\Parsing\ModeRegistry;
use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;

/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Only loaded when Markdown is preferred or the only syntax; see SPEC.md and
 * the registry test coverage in ModeRegistryTest.
 *
 * Follows the FormattingTest pattern: one mode loaded in isolation, assertions
 * against handler instruction sequences. The setUp flips ModeRegistry to
 * `markdown` syntax so the Base mode recognizes `gfm_emphasis_underscore` as
 * an allowed nested mode.
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Switch the configured syntax to `markdown` and reset the mode registry
     * before every test.
     *
     * NOTE(review): `$conf['syntax']` is not restored in tearDown(); this
     * presumably relies on the base setUp() reloading `$conf` for each test —
     * confirm against the base test class.
     */
    public function setUp(): void
    {
        parent::setUp();
        global $conf;
        $conf['syntax'] = 'markdown';
        ModeRegistry::reset();
    }

    /**
     * Reset the mode registry again after every test.
     */
    public function tearDown(): void
    {
        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * Register the mode under test on the parser and parse the given text.
     *
     * Keeps the mode name and construction in one place instead of repeating
     * them in every test.
     *
     * @param string $input wiki text handed to the parser
     * @return array the handler calls recorded while parsing
     */
    private function parseWithUnderscoreMode(string $input): array
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($input);
        return $this->H->calls;
    }

    /**
     * Parse the given text and return only the instruction names (element 0
     * of every handler call), for tests that check presence or absence of an
     * instruction rather than the full sequence.
     *
     * @param string $input wiki text handed to the parser
     * @return array list of instruction names in call order
     */
    private function callNamesFor(string $input): array
    {
        return array_column($this->parseWithUnderscoreMode($input), 0);
    }

    public function testBasicUnderscore(): void
    {
        $actual = $this->parseWithUnderscoreMode('Foo _Bar_ Baz');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testSingleCharacter(): void
    {
        $actual = $this->parseWithUnderscoreMode('foo _b_ bar');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testMultipleWords(): void
    {
        $actual = $this->parseWithUnderscoreMode('_one two three_');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        // GFM's key word-boundary rule: underscores inside words stay literal.
        $actual = $this->parseWithUnderscoreMode('this_is_not_an_emphasis');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $actual = $this->parseWithUnderscoreMode('foo _ bar_ baz');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        // `__foo__` must stay literal. At the first `_`, the lookahead
        // `(?=[^\s_])` forbids entry (next char is another `_`). At the
        // second `_`, the lookbehind also fails because `_` itself is not
        // a "non-word" character (it's excluded from NON_WORD_CHAR so that
        // `__foo` can't open emphasis at the inner underscore).
        $actual = $this->parseWithUnderscoreMode('foo __bar__ baz');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $actual = $this->parseWithUnderscoreMode('_one_ and _two_');
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testMultilineEmphasis(): void
    {
        $actual = $this->parseWithUnderscoreMode("_line\nline\nline_");
        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $actual);
    }

    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    public function testDoesNotSpanParagraphBoundary(): void
    {
        // A blank line sits between the opening and the closing underscore.
        $names = $this->callNamesFor("_open\n\nclose_");
        $this->assertNotContains(
            'emphasis_open',
            $names,
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        // Counterpart to the paragraph-boundary test: one newline is allowed.
        $names = $this->callNamesFor("_open\nclose_");
        $this->assertContains(
            'emphasis_open',
            $names,
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     * This test is derived from CommonMark spec §6.2 example 418:
     *
     *     пристаням_стремятся_
     *
     * which must render as literal (no emphasis). The surrounding Cyrillic
     * letters are word-like; the underscores are intraword and must not
     * emphasize.
     *
     * The word-boundary constants (NO_WORD_BEFORE / NO_WORD_AFTER) are
     * defined positively (matching explicit non-word chars) rather than
     * negatively (not matching a-zA-Z0-9), so multibyte UTF-8 bytes — which
     * are not in any ASCII class — are correctly treated as word-like.
     *
     * @dataProvider provideMultibyteIntrawordCases
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $names = $this->callNamesFor($input);
        $this->assertNotContains(
            'emphasis_open',
            $names,
            "Intraword `_` in multibyte text must not emphasize: " . json_encode($input)
        );
    }

    /**
     * Inputs for testIntrawordUnderscoreInMultibyteText(); every entry is a
     * single string in which no emphasis may be opened.
     *
     * @return array named data sets, each holding one input string
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec §6.2 ex. 418 — Cyrillic intraword
            'cyrillic-trailing' => ['пристаням_стремятся_'],
            // CommonMark spec §6.2 ex. 420 — Cyrillic leading
            'cyrillic-leading' => ['_пристаням_стремятся'],
            // German umlaut — no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut' => ['für_etwas_text'],
            // CJK — same expectation
            'cjk-intraword' => ['日本_語_の'],
            // Greek
            'greek-intraword' => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * A `_foo_` span surrounded by multibyte letters must NOT open at the
     * first `_` (it would be intraword) AND must still NOT open if the
     * following letters are multibyte. Verifies that both the lookbehind
     * and the closing-delimiter lookahead reject multibyte word chars.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $names = $this->callNamesFor('до_середины_текста');
        $this->assertNotContains(
            'emphasis_open',
            $names,
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive: when the surrounding non-word context is whitespace or
     * punctuation, multibyte content *inside* the emphasis span is fine.
     * `_für etwas_` surrounded by spaces should emphasize the multibyte text.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $names = $this->callNamesFor('foo _für etwas_ bar');
        $this->assertContains(
            'emphasis_open',
            $names,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $names,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}