xref: /dokuwiki/inc/Parsing/ParserMode/GfmCode.php (revision b1c59bed2e3645a1f5f11438cdbe7d1596f4a3a4)
1*b1c59bedSAndreas Gohr<?php
2*b1c59bedSAndreas Gohr
3*b1c59bedSAndreas Gohrnamespace dokuwiki\Parsing\ParserMode;
4*b1c59bedSAndreas Gohr
5*b1c59bedSAndreas Gohruse dokuwiki\Parsing\Handler;
6*b1c59bedSAndreas Gohruse dokuwiki\Parsing\Helpers;
7*b1c59bedSAndreas Gohr
/**
 * Parser mode for GFM fenced code blocks written with backticks: ```...```
 *
 * The mode produces the very same `code` handler instruction that
 * DokuWiki's `<code>` mode produces, so rendering, indexing, and syntax
 * highlighting all run through the pipeline that already exists.
 *
 * Info string
 * -----------
 * Whatever follows the opening fence is passed to
 * Helpers::parseCodeAttributes, which means DokuWiki's complete code-tag
 * attribute vocabulary is accepted: a language, an optional filename,
 * and optional [key=value,...] highlight options. On GitHub those extras
 * show up as part of the language class; that difference is deliberate
 * and buys feature parity with DokuWiki's <code>...</code> blocks.
 *
 * Fence placement
 * ---------------
 * Fences must start at column 0 — indented fences are not recognised and
 * the body is never dedented. A closing fence is any run of three or
 * more fence characters at column 0 followed only by whitespace up to
 * the end of the line. The closer's length is not matched against the
 * opener's because ParallelRegex has no backreference support.
 *
 * Unclosed fences
 * ---------------
 * A fence without a closer is left as literal text. The GFM spec lets an
 * unclosed fence run to the end of the input (or of the enclosing
 * container), but that rule belongs to CommonMark's two-pass
 * block-then-inline parser, where "any container boundary closes" is the
 * uniform way blocks end. This lexer is a single regex pass that knows
 * nothing about container boundaries, so "close at EOF" is the most it
 * could offer — and that partial rule already leaks (spec example 98, a
 * fence inside a blockquote, stays red because the blockquote boundary
 * cannot close it). Implementing a degraded variant would only relocate
 * the broken edge case to a less visible spot.
 *
 * Insisting on a closer also matches the other inline GFM modes in this
 * codebase (they all verify a matching closer through an entry-pattern
 * lookahead) as well as DokuWiki's own <code> tag parsing
 * (<code\b(?=.*</code>)>). The failure mode is safer, too: a stray ```
 * at the top of a page remains literal text instead of turning
 * everything beneath it into a code block. Spec examples 96 and 97 are
 * listed in skip.php for this reason.
 *
 * @see GfmFile
 */
class GfmCode extends AbstractMode
{
    /** @var string Call type passed to addCall ('code' or 'file') */
    protected $type = 'code';

    /** @var string Character the fence is built from (`` ` `` or `~`). */
    protected $fenceChar = '`';

    /**
     * Regex fragment matching the info string. A backtick fence must not
     * contain backticks in its info string (spec example 115), whereas a
     * tilde fence accepts everything up to the newline (spec example 116).
     */
    protected $infoClass = '[^\n`]*';

    /**
     * Start out with an empty list of allowed modes.
     */
    public function __construct()
    {
        $this->allowedModes = [];
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 200;
    }

    /** Name of the lexer state / mode. Overridden by subclasses for tildes. */
    protected function getModeName(): string
    {
        return 'gfm_code';
    }

    /**
     * Register the single special pattern that matches a complete fenced
     * block (opener line, body, and closing fence) in one go.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // The pattern is assembled from three fragments
        // (F = fence char, INFO = info-string class):
        //
        //   opener  \nF{3,}INFO(?=\n)
        //             \n      — start of line (the Parser prepends a newline)
        //             F{3,}   — three or more fence chars at column 0
        //             INFO    — info string (language etc.)
        //             (?=\n)  — the opener line has to end at a newline;
        //                       otherwise `` ``` aa ``` `` written on a
        //                       single line would be taken for a fence
        //
        //   body    (?:(?!CLOSE).)*
        //             any character (DOTALL) that does not begin a
        //             close-fence line
        //
        //   CLOSE   \nF{3,}[ \t]*(?=\n)
        //             the mandatory closing fence. There is no `\z`
        //             alternative, so an unclosed fence never matches and
        //             stays literal text.
        $fenceRun = $this->fenceChar . '{3,}';

        $closer = '\n' . $fenceRun . '[ \t]*(?=\n)';
        $opener = '\n' . $fenceRun . $this->infoClass . '(?=\n)';
        $body = '(?:(?!' . $closer . ').)*';

        $this->Lexer->addSpecialPattern(
            $opener . $body . $closer,
            $mode,
            $this->getModeName()
        );
    }

    /**
     * Turn a matched fenced block into a handler call of type $this->type.
     *
     * The call parameters are the block body, the language, and the
     * filename; the highlight options are appended only when
     * Helpers::parseCodeAttributes returned a non-null value for them.
     *
     * @inheritdoc
     *
     * @param string $match the complete text matched by the entry pattern
     * @param int $state lexer state (not used here)
     * @param int $pos position forwarded to the handler call
     * @param Handler $handler receives the resulting call
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $fence = $this->fenceChar;

        // Peel the match from the outside in: first the newline the
        // pattern starts with, then the opening fence run, and finally
        // the closing fence run together with its trailing blanks.
        $afterNewline = substr($match, 1);
        $afterOpener = ltrim($afterNewline, $fence);
        $inner = rtrim($afterOpener, " \t" . $fence);

        // The pattern's `(?=\n)` anchor guarantees the opener line ends
        // at a newline, so this split always yields exactly two pieces.
        [$infoString, $code] = explode("\n", $inner, 2);

        [$lang, $file, $highlight] = Helpers::parseCodeAttributes($infoString);

        $args = [$code, $lang, $file];
        if ($highlight !== null) {
            $args[] = $highlight;
        }

        $handler->addCall($this->type, $args, $pos);
        return true;
    }
}
126