<?php

namespace dokuwiki\test\Parsing\ParserMode;

use dokuwiki\Parsing\ParserMode\GfmEmphasisUnderscore;

/**
 * Tests for the GFM underscore emphasis mode (`_text_`).
 *
 * Each parsing test registers a fresh GfmEmphasisUnderscore mode on the parser
 * provided by the base class, parses a snippet and then inspects the calls
 * recorded on the handler ($this->H->calls).
 */
class GfmEmphasisUnderscoreTest extends ParserTestBase
{
    /**
     * Runs the base-class setup and then selects the 'md' syntax.
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->setSyntax('md');
    }

    /**
     * Register a fresh underscore emphasis mode on the parser and parse $text.
     *
     * @param string $text raw wiki text handed to the parser
     */
    private function parseWithUnderscoreMode(string $text): void
    {
        $this->P->addMode('gfm_emphasis_underscore', new GfmEmphasisUnderscore());
        $this->P->parse($text);
    }

    /**
     * Return the first element of every recorded handler call.
     *
     * In the expected-call lists used throughout this class that element is
     * the instruction name (e.g. 'emphasis_open').
     *
     * @return array
     */
    private function emittedCallNames(): array
    {
        return array_column($this->H->calls, 0);
    }

    /**
     * `_Bar_` between two plain words produces an emphasis open/close pair.
     */
    public function testBasicUnderscore(): void
    {
        $this->parseWithUnderscoreMode('Foo _Bar_ Baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nFoo "]],
            ['emphasis_open', []],
            ['cdata', ['Bar']],
            ['emphasis_close', []],
            ['cdata', [' Baz']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * A single character between the underscores is still emphasised.
     */
    public function testSingleCharacter(): void
    {
        $this->parseWithUnderscoreMode('foo _b_ bar');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo "]],
            ['emphasis_open', []],
            ['cdata', ['b']],
            ['emphasis_close', []],
            ['cdata', [' bar']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Several space-separated words inside one `_..._` span form one emphasis.
     */
    public function testMultipleWords(): void
    {
        $this->parseWithUnderscoreMode('_one two three_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one two three']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Underscores inside a word must come out as plain character data.
     */
    public function testIntrawordUnderscoreIsNotEmphasised(): void
    {
        // GFM's key word-boundary rule: underscores inside words stay literal.
        $this->parseWithUnderscoreMode('this_is_not_an_emphasis');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nthis_is_not_an_emphasis"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * An opening `_` that is directly followed by a space does not emphasise.
     */
    public function testOpenerFollowedBySpaceDoesNotEmphasise(): void
    {
        $this->parseWithUnderscoreMode('foo _ bar_ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo _ bar_ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * `__bar__` is not handled by this mode and stays literal text.
     */
    public function testDoubleUnderscoreDoesNotEmphasise(): void
    {
        // `__foo__` must stay literal. At the first `_`, the lookahead
        // `(?=[^\s_])` forbids entry (next char is another `_`). At the
        // second `_`, the lookbehind also fails because `_` itself is not
        // a "non-word" character (it's excluded from NON_WORD_CHAR so that
        // `__foo` can't open emphasis at the inner underscore).
        $this->parseWithUnderscoreMode('foo __bar__ baz');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\nfoo __bar__ baz"]],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Two independent `_..._` spans on one line each get their own pair.
     */
    public function testTwoSeparateEmphasisOnOneLine(): void
    {
        $this->parseWithUnderscoreMode('_one_ and _two_');
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ['one']],
            ['emphasis_close', []],
            ['cdata', [' and ']],
            ['emphasis_open', []],
            ['cdata', ['two']],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * Single newlines inside the span are kept as part of the emphasised text.
     */
    public function testMultilineEmphasis(): void
    {
        $this->parseWithUnderscoreMode("_line\nline\nline_");
        $calls = [
            ['document_start', []],
            ['p_open', []],
            ['cdata', ["\n"]],
            ['emphasis_open', []],
            ['cdata', ["line\nline\nline"]],
            ['emphasis_close', []],
            ['cdata', ['']],
            ['p_close', []],
            ['document_end', []],
        ];
        $this->assertCalls($calls, $this->H->calls);
    }

    /**
     * The mode reports a sort value of exactly 80.
     */
    public function testSortValue(): void
    {
        $mode = new GfmEmphasisUnderscore();
        $this->assertSame(80, $mode->getSort());
    }

    /**
     * A blank line between opener and closer prevents the emphasis from opening.
     */
    public function testDoesNotSpanParagraphBoundary(): void
    {
        $this->parseWithUnderscoreMode("_open\n\nclose_");
        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'GfmEmphasisUnderscore must not open when the closing `_` is past a blank line'
        );
    }

    /**
     * A single newline between opener and closer does not prevent the emphasis.
     */
    public function testAllowsSingleNewlineInsideMultiline(): void
    {
        $this->parseWithUnderscoreMode("_open\nclose_");
        $this->assertContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'GfmEmphasisUnderscore must still match across a single newline'
        );
    }

    /**
     * The intraword rule must apply to multibyte letters, not just ASCII.
     * This test is derived from CommonMark spec §6.2 example 418:
     *
     *     пристаням_стремятся_
     *
     * which must render as literal (no emphasis). The surrounding Cyrillic
     * letters are word-like; the underscores are intraword and must not
     * emphasize.
     *
     * The word-boundary constants (NO_WORD_BEFORE / NO_WORD_AFTER) are
     * defined positively (matching explicit non-word chars) rather than
     * negatively (not matching a-zA-Z0-9), so multibyte UTF-8 bytes — which
     * are not in any ASCII class — are correctly treated as word-like.
     *
     * @dataProvider provideMultibyteIntrawordCases
     *
     * @param string $input text containing only intraword underscores
     */
    public function testIntrawordUnderscoreInMultibyteText(string $input): void
    {
        $this->parseWithUnderscoreMode($input);
        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            "Intraword `_` in multibyte text must not emphasize: " . json_encode($input)
        );
    }

    /**
     * Named inputs for testIntrawordUnderscoreInMultibyteText().
     *
     * @return array<string, array{0: string}>
     */
    public static function provideMultibyteIntrawordCases(): array
    {
        return [
            // CommonMark spec §6.2 ex. 418 — Cyrillic intraword
            'cyrillic-trailing' => ['пристаням_стремятся_'],
            // CommonMark spec §6.2 ex. 420 — Cyrillic leading
            'cyrillic-leading' => ['_пристаням_стремятся'],
            // German umlaut — no established spec example, but the expected
            // behavior is uncontroversial: intraword `_` stays literal.
            'german-umlaut' => ['für_etwas_text'],
            // CJK — same expectation
            'cjk-intraword' => ['日本_語_の'],
            // Greek
            'greek-intraword' => ['αυτό_είναι_κείμενο'],
        ];
    }

    /**
     * A `_foo_` span surrounded by multibyte letters must NOT open at the
     * first `_` (it would be intraword) AND must still NOT open if the
     * following letters are multibyte. Verifies that both the lookbehind
     * and the closing-delimiter lookahead reject multibyte word chars.
     */
    public function testMultibyteWordCharsAreNotTreatedAsBoundary(): void
    {
        // Intraword between Cyrillic on the left and Cyrillic on the right.
        $this->parseWithUnderscoreMode('до_середины_текста');
        $this->assertNotContains(
            'emphasis_open',
            $this->emittedCallNames(),
            'Cyrillic-surrounded `_` must not emphasize'
        );
    }

    /**
     * Positive: when the surrounding non-word context is whitespace or
     * punctuation, multibyte content *inside* the emphasis span is fine.
     * `_für etwas_` surrounded by spaces should emphasize the multibyte text.
     */
    public function testMultibyteContentInsideEmphasisWorks(): void
    {
        $this->parseWithUnderscoreMode('foo _für etwas_ bar');
        $modes = $this->emittedCallNames();
        $this->assertContains(
            'emphasis_open',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
        $this->assertContains(
            'emphasis_close',
            $modes,
            'Multibyte text inside `_..._` must emphasize when boundaries are clear'
        );
    }
}