xref: /dokuwiki/inc/Parsing/ParserMode/GfmCode.php (revision 13a62f810fbd091d15ab734b467eaec0a6bf829a)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Code as CodeHelper;
7use dokuwiki\Parsing\Helpers\Escape;
8
9/**
10 * GFM fenced code block with backtick fences: ```...```
11 *
12 * Emits the same `code` handler instruction DokuWiki's `<code>` mode
13 * emits, so renderers, indexing, and syntax highlighting reuse the
14 * existing pipeline.
15 *
16 * The info string after the opening fence accepts DokuWiki's full
17 * code-tag attribute vocabulary — language, optional filename, and
18 * optional [key=value,...] highlight options — parsed via
19 * Helpers\Code::parseAttributes. Markdown authors pasting to GitHub
20 * will see the extras render as part of the language class; the
21 * divergence is intentional, for feature parity with DokuWiki's
22 * <code>...</code> blocks.
23 *
24 * Column-0 fences only (no indent tolerance, no body dedent). The close
25 * fence is any run of 3+ fence chars at column 0 with only trailing
26 * whitespace on the line — the opener's length is not paired with the
27 * closer's, because ParallelRegex does not support backreferences.
28 *
29 * Unclosed fences stay literal text. GFM's spec says an unclosed fence
30 * runs to end of input (and any enclosing container's end), but that
31 * rule is part of CommonMark's two-pass block-then-inline parser where
32 * "any container boundary closes" is the uniform termination rule. Our
33 * single-pass regex lexer has no notion of container boundaries, so the
34 * best we could do is "close at EOF" — a partial implementation that
35 * already leaks (spec example 98, fence inside a blockquote, stays red
36 * because we can't close at the blockquote boundary). Doing a degraded
37 * version of the rule just moves the broken edge case somewhere less
38 * obvious.
39 *
40 * Requiring a closer is also consistent with every other inline GFM
41 * mode in this codebase (all of which use entry-pattern lookaheads to
42 * verify a matching closer exists) and with DokuWiki's own <code> tag
43 * parsing (<code\b(?=.*</code>)>). And it has a safer failure mode: a
44 * stray ``` at the top of a document stays as literal text rather than
45 * swallowing everything below it into a code block. Spec examples 96
46 * and 97 are in skip.php with this rationale.
47 *
48 * @see GfmFile
49 */
class GfmCode extends AbstractMode
{
    /** @var string Instruction name passed to Handler::addCall ('code' or 'file') */
    protected $type = 'code';

    /** @var string The single character fences are built from (`` ` `` or `~`). */
    protected $fenceChar = '`';

    /**
     * Regex fragment that matches the info string following the opener.
     *
     * For backtick fences the info string may not contain a backtick
     * (spec example 115). Tilde fences accept everything up to the
     * newline (spec example 116).
     */
    protected $infoClass = '[^\n`]*';

    /**
     * A fenced block is a leaf: no other mode may start inside it.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 200;
    }

    /** Name under which the pattern is registered; tilde subclasses override it. */
    protected function getModeName(): string
    {
        return 'gfm_code';
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // The whole block is matched by one special pattern, assembled from
        // the pieces below (the lexer is expected to run it in DOTALL mode):
        //
        //   $fenceRun  3+ fence chars
        //   $opener    newline (the Parser prepends one, so this also covers
        //              the first line), fence run, info string; the
        //              lookahead forces the opener line to end right there,
        //              so `` ``` aa ``` `` on a single line is not a fence
        //   $closer    newline, fence run, optional trailing blanks, then a
        //              newline. There is deliberately no end-of-input
        //              alternative: an unclosed fence does not match at all
        //   $body      every character that does not begin a $closer
        $fenceRun = $this->fenceChar . '{3,}';
        $opener = '\n' . $fenceRun . $this->infoClass . '(?=\n)';
        $closer = '\n' . $fenceRun . '[ \t]*(?=\n)';
        $body = '(?:(?!' . $closer . ').)*';

        $this->Lexer->addSpecialPattern(
            $opener . $body . $closer,
            $mode,
            $this->getModeName()
        );
    }

    /** @inheritdoc */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $fence = $this->fenceChar;

        // Peel the match from the outside in: first the newline the pattern
        // starts with, then the opening fence run, finally the closing fence
        // run including any blanks behind it. The right trim stops at the
        // newline in front of the closing fence, so that newline stays.
        $afterNewline = substr($match, 1);
        $afterOpener = ltrim($afterNewline, $fence);
        $inner = rtrim($afterOpener, " \t" . $fence);

        // The pattern guarantees a newline directly after the info string,
        // hence the split yields exactly an info part and a body part.
        $pieces = explode("\n", $inner, 2);
        $info = $pieces[0];
        $body = $pieces[1];

        $attributes = CodeHelper::parseAttributes(Escape::unescapeBackslashes($info));
        [$language, $filename, $options] = $attributes;

        // Options are appended only when present; otherwise the call keeps
        // its three-element parameter list.
        $param = [$body, $language, $filename];
        if ($options !== null) {
            $param[] = $options;
        }

        $handler->addCall($this->type, $param, $pos);
        return true;
    }
}
127