<?php

namespace dokuwiki\test\Parsing\ParserMode;

use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;

/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Every parsing test registers a fresh GfmEmphasisUnderscore mode under the
 * name 'gfm_emphasis_underscore', parses a small snippet and then inspects
 * the handler calls recorded in $this->H->calls.
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Run the parent fixture setup, then switch the syntax to 'md'.
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->setSyntax('md');
    }

    /**
     * Register a fresh GfmEmphasisUnderscore mode on the parser and parse $text.
     *
     * @param string $text markup to feed to the parser
     */
    private function parseWithUnderscoreMode(string $text): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($text);
    }

    /**
     * Collect the first element (the call name) of every recorded handler call.
     *
     * @return array call names in the order they were recorded
     */
    private function emittedCallNames(): array
    {
        return array_column($this->H->calls, 0);
    }

    /**
     * A single emphasised word between two plain words.
     */
    public function testBasicUnderscore(): void
    {
        $this->parseWithUnderscoreMode('Foo _Bar_ Baz');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * The emphasised content may be just one character long.
     */
    public function testSingleCharacter(): void
    {
        $this->parseWithUnderscoreMode('foo _b_ bar');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * Emphasis spanning several words and covering the whole input.
     */
    public function testMultipleWords(): void
    {
        $this->parseWithUnderscoreMode('_one two three_');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            // An empty cdata call is expected after the closing delimiter.
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * GFM's key word-boundary rule: underscores inside words stay literal.
     */
    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        $this->parseWithUnderscoreMode('this_is_not_an_emphasis');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * An opening `_` that is directly followed by a space must stay literal.
     */
    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo _ bar_ baz');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * `__foo__` must stay literal when only this mode is loaded.
     *
     * NOTE(review): the reasoning below describes the mode's regex, which is
     * not visible from this test file - confirm against GfmEmphasisUnderscore.
     * At the first `_`, the lookahead `(?=[^\s_])` is said to forbid entry
     * (the next char is another `_`). At the second `_`, the lookbehind is
     * said to fail as well, because `_` itself is not treated as a "non-word"
     * character (it is excluded from NON_WORD_CHAR so that `__foo` cannot
     * open emphasis at the inner underscore).
     */
    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo __bar__ baz');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * Two independent emphasis spans on the same line are both recognised.
     */
    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $this->parseWithUnderscoreMode('_one_ and _two_');

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * Single newlines inside the span are kept within one cdata call.
     */
    public function testMultilineEmphasis(): void
    {
        $this->parseWithUnderscoreMode("_line\nline\nline_");

        $expected = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($expected, $this->H->calls);
    }

    /**
     * The mode must report a sort value of exactly 80.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    /**
     * A blank line between the opening and the closing `_` prevents emphasis.
     */
    public function testDoesNotSpanParagraphBoundary(): void
    {
        $this->parseWithUnderscoreMode("_open\n\nclose_");

        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    /**
     * Counterpart to the blank-line test: one newline alone does not block emphasis.
     */
    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $this->parseWithUnderscoreMode("_open\nclose_");

        $this->assertContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     *
     * The Cyrillic cases are taken from the CommonMark spec's section on
     * emphasis, for example
     *
     *     пристаням_стремятся_
     *
     * which must render as literal text (no emphasis): the surrounding
     * Cyrillic letters are word-like, so the underscores are intraword.
     *
     * NOTE(review): the original author states that the mode's word-boundary
     * constants (NO_WORD_BEFORE / NO_WORD_AFTER) are defined positively
     * (matching explicit non-word chars) rather than negatively (not matching
     * a-zA-Z0-9), so that multibyte UTF-8 bytes are treated as word-like.
     * Those constants are not visible from this file - confirm against the
     * mode's implementation.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input markup whose underscores are all expected to stay literal
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $this->parseWithUnderscoreMode($input);

        // JSON_UNESCAPED_UNICODE keeps the failing input readable instead of
        // printing it as a run of \uXXXX escapes.
        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            "Intraword `_` in multibyte text must not emphasize: "
                . json_encode($input, JSON_UNESCAPED_UNICODE)
        );
    }

    /**
     * Named inputs for testIntrawordUnderscoreInMultibyteText().
     *
     * @return array map of data set name to a one-element argument list
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec, emphasis section - Cyrillic intraword.
            // (Cited here as §6.2 ex. 418; section and example numbers differ
            // between spec versions - verify against the targeted version.)
            'cyrillic-trailing' => ['пристаням_стремятся_'],
            // CommonMark spec, emphasis section - Cyrillic leading.
            // (Cited here as §6.2 ex. 420; same caveat as above.)
            'cyrillic-leading' => ['_пристаням_стремятся'],
            // German umlaut - no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut' => ['für_etwas_text'],
            // CJK - same expectation
            'cjk-intraword' => ['日本_語_の'],
            // Greek
            'greek-intraword' => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * Underscores sandwiched between multibyte letters on both sides are
     * intraword and must not open emphasis: multibyte word characters must
     * not be mistaken for a word boundary, neither before the opening `_`
     * nor after the closing one.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $this->parseWithUnderscoreMode('до_середины_текста');

        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive case: when the span is delimited by whitespace, multibyte
     * content *inside* the emphasis is fine. `_für etwas_` surrounded by
     * spaces must produce both an emphasis_open and an emphasis_close call.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $this->parseWithUnderscoreMode('foo _für etwas_ bar');
        $names = $this->emittedCallNames();

        $this->assertContains(
            'emphasis_open',
            $names,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $names,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}