<?php

namespace dokuwiki\test\Parsing\ParserMode;

use dokuwiki\Parsing\ModeRegistry;
use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;

/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Each test registers a fresh GfmEmphasisUnderscore mode on the parser
 * provided by ParserTestBase ($this->P), parses a snippet and inspects the
 * instruction calls recorded by the handler ($this->H->calls).
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Switch the wiki syntax setting to markdown and reset the mode registry
     * before every test.
     */
    public function setUp(): void
    {
        parent::setUp();
        global $conf;
        $conf['syntax'] = 'markdown';
        ModeRegistry::reset();
    }

    /**
     * Reset the mode registry again after every test, then run the parent
     * teardown.
     */
    public function tearDown(): void
    {
        ModeRegistry::reset();
        parent::tearDown();
    }

    /**
     * A single word wrapped in underscores, surrounded by spaces, is emphasised.
     */
    public function testBasicUnderscore(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('Foo _Bar_ Baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The emphasised content may be a single character.
     */
    public function testSingleCharacter(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('foo _b_ bar');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The emphasised content may contain spaces, and the span may cover the
     * whole input.
     */
    public function testMultipleWords(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('_one two three_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Underscores inside a word must stay literal.
     */
    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        // GFM's key word-boundary rule: underscores inside words stay literal.
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('this_is_not_an_emphasis');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * An underscore followed by whitespace must not open emphasis.
     */
    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('foo _ bar_ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * `__foo__` is not handled by this mode and must pass through as text.
     */
    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        // `__foo__` must stay literal. Rationale as recorded by the original
        // author (the pattern lives in GfmEmphasisUnderscore, not in this
        // file - NOTE(review): confirm against the mode's regex):
        //  - at the first `_`, the lookahead `(?=[^\s_])` forbids entry
        //    because the next character is another `_`;
        //  - at the second `_`, the lookbehind fails because `_` itself is
        //    not a "non-word" character (it is excluded from NON_WORD_CHAR so
        //    that `__foo` cannot open emphasis at the inner underscore).
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('foo __bar__ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Two independent emphasis spans on the same line are both recognised.
     */
    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('_one_ and _two_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Emphasis may span several lines separated by single newlines.
     */
    public function testMultilineEmphasis(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse("_line\nline\nline_");
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The mode reports a sort value of 80.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    /**
     * A blank line between the opening and the closing `_` prevents emphasis.
     */
    public function testDoesNotSpanParagraphBoundary(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse("_open\n\nclose_");
        // Only the call names matter here, not their arguments.
        $modes = array_column($this->H->calls, 0);
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    /**
     * Counterpart to the paragraph-boundary test: one newline is still fine.
     */
    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse("_open\nclose_");
        $modes = array_column($this->H->calls, 0);
        $this->assertContains(
            'emphasis_open',
            $modes,
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     *
     * The leading case is taken from the CommonMark spec's emphasis examples
     * (cited by the original author as §6.2 example 418 - NOTE(review):
     * confirm the example number against the targeted spec version):
     *
     *     пристаням_стремятся_
     *
     * which must render as literal text (no emphasis). The surrounding
     * Cyrillic letters are word-like, so the underscores are intraword and
     * must not emphasise.
     *
     * Per the original author, the mode's word-boundary constants
     * (NO_WORD_BEFORE / NO_WORD_AFTER) are defined positively (matching
     * explicit non-word characters) rather than negatively (not matching
     * a-zA-Z0-9), so multibyte UTF-8 bytes are treated as word-like.
     * NOTE(review): those constants are not visible in this file.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input markup that must not produce an emphasis_open call
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($input);
        $modes = array_column($this->H->calls, 0);
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            "Intraword `_` in multibyte text must not emphasize: " . json_encode($input)
        );
    }

    /**
     * Inputs in which every `_` touches a multibyte letter on at least one side.
     *
     * @return array<string, array{0: string}> named datasets, one input string each
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec emphasis example, Cyrillic intraword (cited by the
            // original author as §6.2 ex. 418 - NOTE(review): confirm number)
            'cyrillic-trailing' => ['пристаням_стремятся_'],
            // CommonMark spec emphasis example, Cyrillic leading (cited by the
            // original author as §6.2 ex. 420 - NOTE(review): confirm number)
            'cyrillic-leading' => ['_пристаням_стремятся'],
            // German umlaut - no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut' => ['für_etwas_text'],
            // CJK - same expectation
            'cjk-intraword' => ['日本_語_の'],
            // Greek
            'greek-intraword' => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * Underscores with multibyte letters on both sides must not emphasise.
     *
     * In the input every `_` sits between two Cyrillic letters, so neither
     * underscore may act as an opening delimiter.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $this->P->parse('до_середины_текста');
        $modes = array_column($this->H->calls, 0);
        $this->assertNotContains(
            'emphasis_open',
            $modes,
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive case: when the span is delimited by whitespace, multibyte
     * content *inside* the emphasis is fine. `_für etwas_` surrounded by
     * spaces must produce both an opening and a closing emphasis call.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse('foo _für etwas_ bar');
        $modes = array_column($this->H->calls, 0);
        $this->assertContains(
            'emphasis_open',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}