xref: /dokuwiki/inc/Parsing/Handler/GfmTable.php (revision 13a62f810fbd091d15ab734b467eaec0a6bf829a)
1<?php
2
3namespace dokuwiki\Parsing\Handler;
4
/**
 * CallWriter rewriter for GFM tables.
 *
 * GfmTable's lexer state emits a flat token stream of marker calls
 * (`gfm_table_start`, `gfm_table_row`, `gfm_table_cell`, `gfm_table_end`)
 * interleaved with whatever inline modes (emphasis, code spans, links, …)
 * matched inside the cells. This rewriter:
 *
 *   1. Groups the flat stream into rows-of-cells, where each cell carries
 *      its own list of nested handler calls.
 *   2. Strips the empty leading and trailing cells that result from leading
 *      and trailing pipes (`| a | b |` → cells `["", " a ", " b ", ""]` →
 *      `[" a ", " b "]`).
 *   3. Parses the second row as the GFM delimiter row, deriving per-column
 *      alignment from `:?-+:?` patterns and the column count from the cell
 *      count.
 *   4. Validates that the header row's cell count matches the delimiter's.
 *      On mismatch (spec example 203), emits the captured text back as a
 *      single cdata so the Block rewriter wraps it in a paragraph.
 *   5. Pads body rows that are short (spec 202) and truncates body rows
 *      that are long (spec 204) to the header's column count.
 *   6. Trims leading/trailing whitespace from each cell's edge cdata calls
 *      ("Spaces between pipes and cell content are trimmed").
 *   7. Emits the canonical DokuWiki table call sequence — `table_open`,
 *      `tablethead_open`, `tablerow_open`, per-column `tableheader_open`
 *      with alignment, `tablethead_close`, then (only when there are
 *      body rows — spec 205) `tabletbody_open`, per-row `tablerow_open`
 *      with `tablecell_open`, `tabletbody_close`, and finally
 *      `table_close`. No new handler instructions are introduced;
 *      `tabletbody_open` / `tabletbody_close` are part of DokuWiki's
 *      base renderer API but were never emitted before — DW Table omits
 *      `<tbody>` entirely. Activating them here is what frees the test
 *      renderer from having to track tbody state.
 *
 * Backslash-escaped pipes outside protected regions are consumed by
 * GfmEscape before the cell content reaches this rewriter. Inside
 * code spans (and any other whole-span PROTECTED capture) the `\|`
 * survives as literal text — and the GFM tables extension demands
 * that `\|` unescape to `|` even there, overriding §6.1's
 * "escapes don't work in code spans" rule. unescapePipes() applies
 * that rewrite per cell to every text-bearing call.
 */
class GfmTable extends AbstractRewriter
{
    /** @inheritdoc */
    protected function getClosingCall(): string
    {
        return 'gfm_table_end';
    }

    /**
     * Turn the captured marker stream into table calls (or the spec-203
     * paragraph fallback) and hand them to the outer call writer.
     *
     * @inheritdoc
     */
    public function process()
    {
        ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos] = $this->groupRows();
        $rows = array_map($this->stripBoundaryEmpty(...), $rows);

        // Defensive: a table needs a header row and a delimiter row. The
        // entry pattern is expected to guarantee both, but reading $rows[1]
        // blindly would fail hard on a truncated stream, so degrade to the
        // plain-text fallback instead.
        if (count($rows) < 2) {
            $this->emitFallback($rows, $startPos);
            return $this->callWriter;
        }

        [$header, $delimiter] = $rows;

        $alignments = array_map(
            fn($cell) => $this->parseAlign($this->cellText($cell)),
            $delimiter
        );
        $cols = count($alignments);

        // Header / delimiter column-count mismatch is the spec-203 fallback.
        if (count($header) !== $cols) {
            $this->emitFallback($rows, $startPos);
            return $this->callWriter;
        }

        $headerRow = $this->unescapePipes($this->trimCellEdges($header));
        $bodyRows = array_map(
            fn($row) => $this->unescapePipes($this->trimCellEdges($this->padOrTruncate($row, $cols))),
            array_slice($rows, 2)
        );

        $this->callWriter->writeCalls(
            $this->buildOutput($headerRow, $bodyRows, $alignments, $cols, $startPos, $endPos)
        );
        return $this->callWriter;
    }

    /**
     * Walk $this->calls and bucket them into rows-of-cells-of-calls.
     *
     * Calls that arrive before the first cell marker of a row are dropped,
     * as is a cell marker that arrives before any row marker.
     *
     * @return array{rows: array<int, array<int, array<int, array>>>, startPos: int, endPos: int}
     *   `rows[r][c]` is a list of handler calls captured inside row `r`'s
     *   cell `c`. `startPos` and `endPos` carry the table's opening and
     *   closing source positions.
     */
    protected function groupRows(): array
    {
        $rows = [];
        $startPos = 0;
        $endPos = 0;

        foreach ($this->calls as $call) {
            $current = count($rows) - 1;

            switch ($call[0]) {
                case 'gfm_table_start':
                    // prefer the position carried in the args, else the call's own
                    $startPos = $call[1][0] ?? $call[2];
                    break;
                case 'gfm_table_end':
                    $endPos = $call[2];
                    break;
                case 'gfm_table_row':
                    $rows[] = [];
                    break;
                case 'gfm_table_cell':
                    // a cell marker without an open row has nowhere to go
                    if ($current >= 0) {
                        $rows[$current][] = [];
                    }
                    break;
                default:
                    // ordinary call: append to the most recently opened cell
                    if ($current >= 0 && !empty($rows[$current])) {
                        $cellIdx = count($rows[$current]) - 1;
                        $rows[$current][$cellIdx][] = $call;
                    }
                    break;
            }
        }

        return ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos];
    }

    /**
     * Remove the leading and the trailing empty cell from the given row.
     *
     * These are the effect of leading and trailing pipes: `| a | b |`
     * parses into four cells `["", " a ", " b ", ""]`. A row with no
     * surrounding pipes (`a | b`) parses into two non-empty cells, which
     * stay untouched.
     *
     * @param array $row a row as a list of cells; each cell is a list of
     *                   handler calls captured between separators
     * @return array the row with at most one boundary empty cell stripped
     *               from each end
     */
    protected function stripBoundaryEmpty(array $row): array
    {
        if ($row && $row[0] === []) {
            array_shift($row);
        }
        if ($row && end($row) === []) {
            array_pop($row);
        }
        return $row;
    }

    /**
     * Concatenate the original source text of every text-bearing call in a
     * cell. Used for delimiter parsing and the spec-203 fallback.
     *
     * Relies on the project-wide convention that any inline mode which
     * swallows source text records the matched string at args[0] — true
     * for `cdata`, `entity`, `unformatted`, `smiley`, `multiplyentity`,
     * plugin substitutions, etc. Open/close pairs carry empty args and
     * drop out naturally.
     *
     * Motivating case: Entity eats runs of `---` as em-dash entities, so
     * a naive cdata-only join would lose the delimiter dashes and
     * parseAlign() would refuse the column.
     *
     * Only string first-arguments are joined. Anything else (for example
     * an array of nested calls) is not source text and would otherwise be
     * imploded as the literal word "Array" with a conversion warning. This
     * mirrors the is_string() guard in unescapePipes().
     *
     * @param array $cellCalls handler calls captured inside one cell
     * @return string the concatenated source text
     */
    protected function cellText(array $cellCalls): string
    {
        $firstArgs = array_column(array_column($cellCalls, 1), 0);
        return implode('', array_filter($firstArgs, is_string(...)));
    }

    /**
     * Decode a single delimiter cell into 'left' / 'center' / 'right' / null.
     *
     * Trusts the entry pattern's validation that the cell has the shape
     * `:?-+:?`; just checks for colons at the edges.
     *
     * @param string $cellText the joined source text of one delimiter cell
     * @return string|null 'left', 'center', 'right', or null when no
     *                     alignment marker is present
     */
    protected function parseAlign(string $cellText): ?string
    {
        $marker = trim($cellText);
        $left = str_starts_with($marker, ':');
        $right = str_ends_with($marker, ':');

        return match (true) {
            $left && $right => 'center',
            $right => 'right',
            $left => 'left',
            default => null,
        };
    }

    /**
     * Return a copy of the row padded with empty cells (spec 202) or
     * truncated to the header column count (spec 204).
     *
     * @param array $row a body row as a list of cells
     * @param int $cols the target column count derived from the delimiter row
     * @return array the row with exactly $cols cells
     */
    protected function padOrTruncate(array $row, int $cols): array
    {
        // array_pad() is a no-op for long rows, array_slice() for short ones
        return array_slice(array_pad($row, $cols, []), 0, $cols);
    }

    /**
     * Return a copy of the row with the whitespace between each cell's
     * pipes and its content removed; see trimCell() for the exact rule.
     *
     * @param array $row a row as a list of cells
     * @return array the row with each cell's edge cdata trimmed
     */
    protected function trimCellEdges(array $row): array
    {
        return array_map($this->trimCell(...), $row);
    }

    /**
     * Helper for trimCellEdges: trim edge cdata of a single cell.
     *
     * The first cdata is ltrimmed and the last cdata is rtrimmed, but only
     * when no content-bearing call sits between that cdata and the cell
     * edge. Calls with empty args (open/close markers) do not count as
     * content, so `strong_open, cdata " a "` is still trimmed. A cdata that
     * follows e.g. an `entity` keeps its leading space: that space
     * separates two pieces of content and is not pipe padding. Any cdata
     * that is empty afterwards is dropped.
     *
     * @param array $cell the cell as a list of handler calls
     * @return array the cell, re-indexed, with its edge cdata trimmed and
     *               empty cdata removed
     */
    protected function trimCell(array $cell): array
    {
        // normalise to a list so that keys double as offsets for array_slice()
        $cell = array_values($cell);

        $cdataIdx = array_keys(array_filter($cell, static fn($call) => $call[0] === 'cdata'));
        if ($cdataIdx) {
            $first = $cdataIdx[0];
            $last = $cdataIdx[count($cdataIdx) - 1];

            // Per the convention cellText() relies on, open/close markers
            // carry empty args; a call with args contributes content.
            $hasContent = static fn($call): bool => !empty($call[1]);

            if (!array_filter(array_slice($cell, 0, $first), $hasContent)) {
                $cell[$first][1][0] = ltrim($cell[$first][1][0]);
            }
            if (!array_filter(array_slice($cell, $last + 1), $hasContent)) {
                $cell[$last][1][0] = rtrim($cell[$last][1][0]);
            }
        }

        // keep every call that is not a cdata, and every cdata that still has text
        return array_values(array_filter(
            $cell,
            static fn($call) => $call[0] !== 'cdata' || $call[1][0] !== ''
        ));
    }

    /**
     * Apply the GFM tables-extension rule that `\|` always unescapes to
     * `|` inside table cells — including the bodies of code spans and
     * other whole-span PROTECTED captures, where standard §6.1 escape
     * rules don't fire. Walks every text-bearing call (cdata,
     * unformatted, entity, plugin substitutions, …) and str_replace's
     * the literal two-char sequence on its first arg. Other escapes
     * inside code spans are left alone — only `\|` gets the special
     * table treatment.
     *
     * In normal cell text, GfmEscape has already consumed `\|` upstream,
     * so this pass is a no-op there; its job is to catch the codespan
     * case that bypasses the lexer.
     *
     * Implemented with array_map() rather than by-reference foreach so no
     * reference is left bound to the returned row.
     *
     * @param array $row a row as a list of cells
     * @return array the row with `\|` rewritten to `|` in every cell
     */
    protected function unescapePipes(array $row): array
    {
        $unescapeCall = static function ($call) {
            // only string first-arguments are text; leave everything else alone
            if (isset($call[1][0]) && is_string($call[1][0])) {
                $call[1][0] = str_replace('\\|', '|', $call[1][0]);
            }
            return $call;
        };

        return array_map(
            static fn($cell) => array_map($unescapeCall, $cell),
            $row
        );
    }

    /**
     * Spec-203 fallback. Reconstruct a `|a|b|`-style line from each row's
     * cells via cellText() and emit the joined block as a single cdata so
     * the Block rewriter wraps it in a paragraph. Because cellText() also
     * walks `entity` / `unformatted` / etc., the source-text delimiter
     * characters survive even when an inline mode consumed them.
     *
     * Nothing is emitted when there are no rows at all.
     *
     * @param array $rows the captured rows-of-cells-of-calls structure
     * @param int $pos the source position to attach to the emitted cdata
     */
    protected function emitFallback(array $rows, int $pos): void
    {
        $lines = array_map(
            fn($row) => '|' . implode('|', array_map($this->cellText(...), $row)) . '|',
            $rows
        );

        $text = implode("\n", $lines);
        if ($text === '') {
            return;
        }
        $this->callWriter->writeCall(['cdata', [$text], $pos]);
    }

    /**
     * Assemble the canonical DokuWiki table-instruction sequence.
     *
     * `tabletbody_open` / `tabletbody_close` are emitted only when there
     * are body rows. Suppressing them for empty-body tables (spec 205)
     * matches the spec's "<thead> only, no <tbody>" expectation without
     * any state-tracking on the renderer side.
     *
     * Every structural call is stamped with $startPos; only `table_close`
     * carries $endPos.
     *
     * @param array $headerRow trimmed header row, one cell per column
     * @param array $bodyRows trimmed body rows, each padded or truncated
     *                        to $cols
     * @param array $alignments per-column alignment from the delimiter
     *                          row; each entry is 'left' / 'center' /
     *                          'right' / null
     * @param int $cols column count derived from the delimiter row
     * @param int $startPos source position of the table's start
     * @param int $endPos source position of the table's end
     * @return array the canonical DokuWiki table call sequence ready for
     *               the outer call writer
     */
    protected function buildOutput(
        array $headerRow,
        array $bodyRows,
        array $alignments,
        int $cols,
        int $startPos,
        int $endPos
    ): array {
        $out = [
            // row count is the header row plus all body rows
            ['table_open', [$cols, 1 + count($bodyRows), $startPos], $startPos],
            ['tablethead_open', [], $startPos],
            ...$this->buildRow($headerRow, 'tableheader', $alignments, $startPos),
            ['tablethead_close', [], $startPos],
        ];

        if ($bodyRows) {
            $out[] = ['tabletbody_open', [], $startPos];
            foreach ($bodyRows as $row) {
                foreach ($this->buildRow($row, 'tablecell', $alignments, $startPos) as $call) {
                    $out[] = $call;
                }
            }
            $out[] = ['tabletbody_close', [], $startPos];
        }

        $out[] = ['table_close', [$endPos], $endPos];
        return $out;
    }

    /**
     * Helper for buildOutput: wrap one row's cells in `tablerow_open` /
     * `tablerow_close` and per-cell `<prefix>_open` / `<prefix>_close`.
     *
     * @param array $row the row as a list of cells (lists of handler calls)
     * @param string $cellCall call-name prefix for the cells, either
     *                         'tableheader' or 'tablecell'
     * @param array $alignments per-column alignment, indexed like $row
     * @param int $pos source position stamped on every structural call
     * @return array the flat list of calls for this row
     */
    protected function buildRow(array $row, string $cellCall, array $alignments, int $pos): array
    {
        $out = [['tablerow_open', [], $pos]];
        foreach ($row as $i => $cell) {
            // args are [colspan, alignment, rowspan]; GFM cells never span
            $out[] = [$cellCall . '_open', [1, $alignments[$i] ?? null, 1], $pos];
            foreach ($cell as $call) {
                $out[] = $call;
            }
            $out[] = [$cellCall . '_close', [], $pos];
        }
        $out[] = ['tablerow_close', [], $pos];
        return $out;
    }
}
361