xref: /dokuwiki/inc/Parsing/ParserMode/GfmCode.php (revision 47a02a102092be9e1e6f1ddaf158bdfffdb13d4f)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Code as CodeHelper;
7use dokuwiki\Parsing\Helpers\Escape;
8use dokuwiki\Parsing\Helpers\HtmlEntity;
9
10/**
11 * GFM fenced code block with backtick fences: ```...```
12 *
13 * Emits the same `code` handler instruction DokuWiki's `<code>` mode
14 * emits, so renderers, indexing, and syntax highlighting reuse the
15 * existing pipeline.
16 *
17 * The info string after the opening fence accepts DokuWiki's full
18 * code-tag attribute vocabulary — language, optional filename, and
19 * optional [key=value,...] highlight options — parsed via
20 * Helpers\Code::parseAttributes. Markdown authors pasting to GitHub
21 * will see the extras render as part of the language class; the
22 * divergence is intentional, for feature parity with DokuWiki's
23 * <code>...</code> blocks.
24 *
25 * Column-0 fences only (no indent tolerance, no body dedent). The close
26 * fence is any run of 3+ fence chars at column 0 with only trailing
27 * whitespace on the line — the opener's length is not paired with the
28 * closer's, because ParallelRegex does not support backreferences.
29 *
30 * Unclosed fences stay literal text. GFM's spec says an unclosed fence
31 * runs to end of input (and any enclosing container's end), but that
32 * rule is part of CommonMark's two-pass block-then-inline parser where
33 * "any container boundary closes" is the uniform termination rule. Our
34 * single-pass regex lexer has no notion of container boundaries, so the
35 * best we could do is "close at EOF" — a partial implementation that
36 * already leaks (spec example 98, fence inside a blockquote, stays red
37 * because we can't close at the blockquote boundary). Doing a degraded
38 * version of the rule just moves the broken edge case somewhere less
39 * obvious.
40 *
41 * Requiring a closer is also consistent with every other inline GFM
42 * mode in this codebase (all of which use entry-pattern lookaheads to
43 * verify a matching closer exists) and with DokuWiki's own <code> tag
44 * parsing (<code\b(?=.*</code>)>). And it has a safer failure mode: a
45 * stray ``` at the top of a document stays as literal text rather than
46 * swallowing everything below it into a code block. Spec examples 96
47 * and 97 are in skip.php with this rationale.
48 *
49 * @see GfmFile
50 */
class GfmCode extends AbstractMode
{
    /**
     * Name of the handler call emitted by handle().
     *
     * @var string 'code' by default; 'file' is the other documented value
     */
    protected $type = 'code';

    /**
     * Character the fence runs are made of.
     *
     * @var string a backtick by default; `~` is the other documented value
     */
    protected $fenceChar = '`';

    /**
     * Regex fragment matching the info string that follows the opener.
     *
     * With backtick fences the info string may not itself contain a
     * backtick (spec example 115). Tilde fences accept anything up to the
     * newline (spec example 116).
     *
     * @var string
     */
    protected $infoClass = '[^\n`]*';

    /**
     * Fixed sort value of this mode.
     *
     * @inheritdoc
     */
    public function getSort()
    {
        return 200;
    }

    /**
     * Name under which the fence pattern is registered with the lexer.
     * Kept in its own method so a subclass can supply another name.
     *
     * @return string
     */
    protected function getModeName(): string
    {
        return 'gfm_code';
    }

    /**
     * Register the fenced-block pattern with the lexer.
     *
     * The whole block (opener line, body, close fence) is matched as one
     * special pattern, so handle() receives it in a single call.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // A fence run is three or more fence characters.
        $fenceRun = $this->fenceChar . '{3,}';

        // Opener: newline (the Parser prepends one, so this also matches at
        // the very start of a document), fence run at column 0, info
        // string, and then the line has to end. Without the trailing
        // lookahead a one-line `` ``` aa ``` `` would count as a fence.
        $opener = '\n' . $fenceRun . $this->infoClass . '(?=\n)';

        // Closer: a fence run on a line of its own, trailing blanks
        // allowed. It is mandatory and there is no end-of-input
        // alternative, so an unclosed fence is left as literal text.
        $closer = '\n' . $fenceRun . '[ \t]*(?=\n)';

        // Body: consume characters one at a time (newlines included, the
        // pattern runs in DOTALL) for as long as the next thing is not a
        // close-fence line.
        $body = '(?:(?!' . $closer . ').)*';

        $this->Lexer->addSpecialPattern(
            $opener . $body . $closer,
            $mode,
            $this->getModeName()
        );
    }

    /**
     * Turn one matched fenced block into a handler call.
     *
     * The call parameters are the body text, the language and the
     * filename, followed by the highlight options when the attribute
     * parser returned any.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $fence = $this->fenceChar;

        // Peel the match from the outside in: first the newline the
        // pattern starts with, then the opening fence run, and finally the
        // closing fence run together with any blanks after it.
        $afterNewline = substr($match, 1);
        $afterOpener = ltrim($afterNewline, $fence);
        $content = rtrim($afterOpener, " \t" . $fence);

        // The pattern only matches when the opener line ends in a newline,
        // so splitting at the first newline always gives info + body.
        $pieces = explode("\n", $content, 2);
        [$info, $body] = $pieces;

        $decodedInfo = HtmlEntity::decode($info);
        $attributes = CodeHelper::parseAttributes(
            Escape::unescapeBackslashes($decodedInfo)
        );
        [$language, $filename, $options] = $attributes;

        // Options are appended only when present, so calls without them
        // keep the three-element parameter list.
        $args = [$body, $language, $filename];
        if ($options !== null) {
            $args[] = $options;
        }

        $handler->addCall($this->type, $args, $pos);
        return true;
    }
}
125