xref: /dokuwiki/inc/Parsing/ParserMode/GfmEscape.php (revision 884caed926ca0aa0af6ce3f34ae3aa7317a3361a)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Escape;
7
8/**
9 * GFM backslash escapes: a backslash before any ASCII punctuation
10 * character produces the literal punctuation character; the backslash
11 * itself is consumed and the following char loses any markup meaning.
12 *
13 * Backslashes before any other character (letters, digits, multibyte,
14 * spaces, tabs, newlines) are NOT escapes — those sequences stay
15 * literal because the pattern doesn't match them and the lexer leaves
16 * them as cdata.
17 *
18 * Sort 5 places this mode ahead of every other inline mode so that
19 * leftmost-then-priority resolution claims `\X` before any competing
20 * delimiter (emphasis `*`, heading `#`, link `[`, …) can match the
21 * unescaped char.
22 *
23 * Category SUBSTITUTION (alongside Smiley and Entity) so the mode is
24 * reachable everywhere those run: inside paragraphs, formatting
25 * modes (emphasis, strong, deleted), list items, table cells, headers
26 * — every container whose allowedModes include SUBSTITUTION. Whole-span
27 * code modes (GfmCode, GfmFile, GfmBacktickSingle, GfmBacktickDouble)
28 * capture their entire body in one regex shot and therefore bypass
29 * GfmEscape on their content — matching GFM's rule that escapes don't
30 * fire inside code blocks or code spans.
31 *
32 * Modes that capture a literal string and need GFM unescape applied
33 * post-hoc (link URL/label, fence info string) call
34 * {@see \dokuwiki\Parsing\Helpers\Escape::unescapeBackslashes()} from
35 * their handle() — same character class.
36 *
37 * Collision with DokuWiki's Linebreak mode (`\\` before a space, tab,
38 * or newline): both patterns can claim the two backslashes at the same
39 * position. GfmEscape's sort 5 beats Linebreak's sort 140 on tie, which
40 * would silently swallow every DW forced linebreak in mixed-syntax
41 * settings. To avoid that, when DW syntax is loaded the pattern carries
42 * a negative lookahead that declines `\\` followed by `[ \t\n]` —
43 * deferring those bytes to Linebreak. Mid-line `\\` (e.g. UNC paths
44 * like `\\\\host\\share`) still escapes normally; only the EOL-adjacent
45 * form is handed off. In pure `md` mode no DW Linebreak is loaded and
46 * the lookahead is omitted so GFM-spec behavior is preserved.
47 */
class GfmEscape extends AbstractMode
{
    /**
     * Sort value of this mode.
     *
     * @inheritdoc
     * @return int always 5
     */
    public function getSort()
    {
        return 5;
    }

    /**
     * Register the backslash-escape pattern as a special pattern for $mode.
     *
     * The pattern is a single literal backslash followed by one character
     * from Escape::PUNCTUATION_CHAR_CLASS. Unless the registry reports the
     * syntax 'md', the pattern is prefixed with a negative lookahead that
     * refuses to match at a position where two backslashes are directly
     * followed by a space, tab or newline.
     *
     * @inheritdoc
     * @param string $mode name of the mode the pattern is attached to
     */
    public function connectTo($mode)
    {
        $guard = '';
        if ($this->registry->getSyntax() !== 'md') {
            // Single-quoted PHP: eight backslashes become the regex `\\\\`,
            // i.e. two literal backslashes; `\t` and `\n` reach the regex
            // engine untouched and are interpreted there.
            $guard = '(?!\\\\\\\\[ \t\n])';
        }

        // Four backslashes in single-quoted PHP become the regex `\\`,
        // i.e. one literal backslash.
        $pattern = $guard . '\\\\' . Escape::PUNCTUATION_CHAR_CLASS;

        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_escape');
    }

    /**
     * Emit the escaped character as plain cdata.
     *
     * The first byte of the match (the backslash) is dropped; whatever
     * follows is passed on verbatim as a 'cdata' call at $pos.
     *
     * @inheritdoc
     * @param string $match text matched by the pattern (backslash + character)
     * @param int $state lexer state; not inspected here
     * @param int $pos position of the match, forwarded to the handler call
     * @param Handler $handler handler receiving the cdata call
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $literal = substr($match, 1);
        $handler->addCall('cdata', [$literal], $pos);

        return true;
    }
}
75