xref: /dokuwiki/inc/Parsing/ParserMode/GfmEscape.php (revision 65dd2042806255b56d4cf303530fd396ff38f151)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Escape;
7
8/**
9 * GFM backslash escapes: a backslash before any ASCII punctuation
10 * character produces the literal punctuation character; the backslash
11 * itself is consumed and the following char loses any markup meaning.
12 *
13 * Backslashes before any other character (letters, digits, multibyte,
14 * spaces, tabs, newlines) are NOT escapes — those sequences stay
15 * literal because the pattern doesn't match them and the lexer leaves
16 * them as cdata.
17 *
18 * Sort 5 places this mode ahead of every other inline mode so that
19 * leftmost-then-priority resolution claims `\X` before any competing
20 * delimiter (emphasis `*`, heading `#`, link `[`, …) can match the
21 * unescaped char.
22 *
23 * Category SUBSTITUTION (alongside Smiley and Entity) so the mode is
24 * reachable everywhere those run: inside paragraphs, formatting
25 * modes (emphasis, strong, deleted), list items, table cells, headers
26 * — every container whose allowedModes include SUBSTITUTION. Whole-span
27 * code modes (GfmCode, GfmFile, GfmBacktickSingle, GfmBacktickDouble)
28 * capture their entire body in one regex shot and therefore bypass
29 * GfmEscape on their content — matching GFM's rule that escapes don't
30 * fire inside code blocks or code spans.
31 *
32 * Modes that capture a literal string and need GFM unescape applied
33 * post-hoc (link URL/label, fence info string) call
34 * {@see \dokuwiki\Parsing\Helpers\Escape::unescapeBackslashes()} from
35 * their handle() — same character class.
36 *
37 * Collision with DokuWiki's Linebreak mode (`\\` before a space, tab,
38 * or newline): both patterns can claim the two backslashes at the same
39 * position. GfmEscape's sort 5 beats Linebreak's sort 140 on tie, which
40 * would silently swallow every DW forced linebreak in mixed-syntax
41 * settings. To avoid that, when DW syntax is loaded the pattern carries
42 * a negative lookahead that declines `\\` followed by `[ \t\n]` —
43 * deferring those bytes to Linebreak. Mid-line `\\` (e.g. UNC paths
44 * like `\\\\host\\share`) still escapes normally; only the EOL-adjacent
45 * form is handed off. In pure `md` mode no DW Linebreak is loaded and
46 * the lookahead is omitted so GFM-spec behavior is preserved.
47 */
class GfmEscape extends AbstractMode
{
    /**
     * Initialise the mode with an empty set of allowed nested modes.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /**
     * Sort value of this mode.
     *
     * {@inheritdoc}
     *
     * @return int always 5
     */
    public function getSort()
    {
        return 5;
    }

    /**
     * Register the backslash-escape pattern as a special pattern on the lexer.
     *
     * The pattern is one literal backslash followed by
     * Escape::PUNCTUATION_CHAR_CLASS. Unless the configured syntax is exactly
     * 'md', it is prefixed with a negative lookahead that declines to match
     * where two backslashes are followed by a space, tab or newline.
     *
     * {@inheritdoc}
     */
    public function connectTo($mode)
    {
        global $conf;

        if ($conf['syntax'] === 'md') {
            // no guard in pure md syntax
            $guard = '';
        } else {
            // PHP `\\\\\\\\` → regex `\\\\` → matches two literal backslashes.
            $guard = '(?!\\\\\\\\[ \t\n])';
        }

        // PHP `\\\\` → regex `\\` → matches one literal backslash.
        $pattern = $guard . '\\\\' . Escape::PUNCTUATION_CHAR_CLASS;

        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_escape');
    }

    /**
     * Emit the escaped character as plain character data.
     *
     * The first byte of the match (the backslash) is dropped and the remainder
     * is passed to the handler as a 'cdata' call at $pos. $state is not used.
     *
     * {@inheritdoc}
     *
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $literal = substr($match, 1);
        $handler->addCall('cdata', [$literal], $pos);

        return true;
    }
}
81