xref: /dokuwiki/inc/Parsing/ParserMode/GfmTable.php (revision 3dabe4e0a0d70b79a7aced8ac8a36d4b37a61024)
1*3dabe4e0SAndreas Gohr<?php
2*3dabe4e0SAndreas Gohr
3*3dabe4e0SAndreas Gohrnamespace dokuwiki\Parsing\ParserMode;
4*3dabe4e0SAndreas Gohr
5*3dabe4e0SAndreas Gohruse dokuwiki\Parsing\Handler;
6*3dabe4e0SAndreas Gohruse dokuwiki\Parsing\Handler\GfmTable as GfmTableRewriter;
7*3dabe4e0SAndreas Gohruse dokuwiki\Parsing\ModeRegistry;
8*3dabe4e0SAndreas Gohr
/**
 * GFM table block.
 *
 * Architecturally mirrors DokuWiki's native Table mode: an entry/exit
 * lexer state with inline modes nested via `allowedModes`, plus a small
 * post-processing rewriter (Handler\GfmTable) that turns the flat token
 * stream into the canonical DokuWiki table call sequence.
 *
 * Cells are inline-only per spec ("Block-level elements cannot be inserted
 * in a table"). Allowed nested categories therefore mirror DW Table:
 * FORMATTING, SUBSTITION, PROTECTED, DISABLED.
 *
 * Entry-pattern strategy: a single zero-width lookahead asserts the table
 * shape (header line containing a pipe, followed by a delimiter row whose
 * cells are exactly `:?-+:?`). Only the leading newline is consumed; the
 * lookahead validates the rest. Non-tables — paragraphs that happen to
 * contain pipes — never enter the mode.
 *
 * The internal patterns recognise:
 *   - `\|` as a cell separator, with a `(?<!\\)` lookbehind so a backslash-
 *     prefixed pipe is left as raw input — the cell-splitting concern. The
 *     unescape itself (turning `\|` into a literal `|`) is GfmEscape's
 *     concern, not this mode's; until that mode lands, `\|` survives in
 *     cell content as the literal two-char sequence.
 *   - `\n` followed by a non-newline, non-`>` character as a row separator;
 *   - any other `\n` exits the mode (blank line, blockquote start, EOF).
 *
 * Sort 55 — five below DW Table's 60 — so that in `dw+md` and `md+dw` (where
 * both modes load) the GFM lookahead-validated entry tries first; if it
 * does not see a valid delimiter row, DW Table at sort 60 takes over for
 * `\n|` rows.
 */
class GfmTable extends AbstractMode
{
    /**
     * Limit nested modes to the inline categories; table cells never
     * contain block-level content.
     */
    public function __construct()
    {
        $registry = ModeRegistry::getInstance();
        $inlineCategories = [
            ModeRegistry::CATEGORY_FORMATTING,
            ModeRegistry::CATEGORY_SUBSTITION,
            ModeRegistry::CATEGORY_PROTECTED,
            ModeRegistry::CATEGORY_DISABLED,
        ];
        $this->allowedModes = $registry->getModesForCategories($inlineCategories);
    }

    /**
     * Sort weight of this mode.
     *
     * @inheritdoc
     */
    public function getSort()
    {
        return 55;
    }

    /**
     * Register `gfm_table` as a block end-of-line mode with the registry.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        $registry = ModeRegistry::getInstance();
        $registry->registerBlockEolMode('gfm_table');
    }

    /**
     * Register the entry pattern for the given parent mode.
     *
     * The pattern consumes a single `\n`. Everything after it sits in a
     * zero-width lookahead that requires, in order:
     *   - a header line with at least one `|`,
     *   - a newline,
     *   - a delimiter row made of `:?-+:?` cells separated by `|`, with
     *     optional leading/trailing pipe and optional spaces or tabs,
     *   - a newline or the end-of-input anchor.
     *
     * Text that merely contains a pipe but has no delimiter row below it
     * does not match, so it is not pulled into table mode.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // Building blocks of the delimiter row.
        $blank = '[ \t]*';
        $dashes = ':?-+:?';

        $delimiterRow =
            $blank . '\|?' . $blank . $dashes .                // first cell, optional leading pipe
            '(?:' . $blank . '\|' . $blank . $dashes . ')*' .  // any further pipe-separated cells
            $blank . '\|?' . $blank;                           // optional trailing pipe

        // Header line: anything on one line, as long as a pipe is in it.
        $headerLine = '[^\n]*\|[^\n]*';

        $lookahead = $headerLine . '\n' . $delimiterRow . '(?:\n|$)';

        $this->Lexer->addEntryPattern('\n(?=' . $lookahead . ')', $mode, 'gfm_table');
    }

    /**
     * Register the patterns that are active while inside a table.
     *
     * @inheritdoc
     */
    public function postConnect()
    {
        $lexer = $this->Lexer;

        // Cell separator: a pipe that is not directly preceded by a
        // backslash, so `\|` stays inside the cell instead of splitting
        // it. Unescaping `\|` to `|` is left to GfmEscape. Known edge:
        // for `\\|` the lookbehind sees the second backslash and does not
        // split; GfmEscape consuming `\\` first is expected to leave a
        // plain `|` at the separator position.
        $lexer->addPattern('(?<!\\\\)\|', 'gfm_table');

        // Row separator: a newline whose next character is neither a
        // newline nor `>`. Blank lines and end-of-input therefore do not
        // start a row, and a blockquote marker ends the table (spec 201).
        $lexer->addPattern('\n(?=[^\n>])', 'gfm_table');

        // Every remaining newline (blank line, blockquote start, EOF)
        // leaves the mode.
        $lexer->addExitPattern('\n', 'gfm_table');
    }

    /**
     * Translate lexer events into `gfm_table_*` and `cdata` calls.
     *
     * On entry a GfmTableRewriter is installed as the handler's call
     * writer; on exit its process() result is installed in its place.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        if ($state == DOKU_LEXER_ENTER) {
            // Wrap the current writer so the table calls are collected
            // by the rewriter.
            $rewriter = new GfmTableRewriter($handler->getCallWriter());
            $handler->setCallWriter($rewriter);

            // The entry match is the consumed `\n`, so the table body
            // starts one character later.
            $bodyStart = $pos + 1;
            $handler->addCall('gfm_table_start', [$bodyStart], $pos);
            $handler->addCall('gfm_table_row', [], $pos);
            $handler->addCall('gfm_table_cell', [], $pos);
        } elseif ($state == DOKU_LEXER_MATCHED) {
            // A newline match starts a new row and its first cell; a bare
            // `|` only starts the next cell of the current row.
            $calls = str_contains($match, "\n")
                ? ['gfm_table_row', 'gfm_table_cell']
                : ['gfm_table_cell'];
            foreach ($calls as $call) {
                $handler->addCall($call, [], $pos);
            }
        } elseif ($state == DOKU_LEXER_UNMATCHED) {
            // Plain cell text.
            $handler->addCall('cdata', [$match], $pos);
        } elseif ($state == DOKU_LEXER_EXIT) {
            $handler->addCall('gfm_table_end', [], $pos);

            /** @var GfmTableRewriter $rewriter */
            $rewriter = $handler->getCallWriter();
            $handler->setCallWriter($rewriter->process());
        }

        return true;
    }
}
154