xref: /dokuwiki/inc/Parsing/Handler/GfmTable.php (revision 3dabe4e0a0d70b79a7aced8ac8a36d4b37a61024)
1<?php
2
3namespace dokuwiki\Parsing\Handler;
4
5/**
6 * CallWriter rewriter for GFM tables.
7 *
8 * GfmTable's lexer state emits a flat token stream of marker calls
9 * (`gfm_table_start`, `gfm_table_row`, `gfm_table_cell`, `gfm_table_end`)
10 * interleaved with whatever inline modes (emphasis, code spans, links, …)
11 * matched inside the cells. This rewriter:
12 *
13 *   1. Groups the flat stream into rows-of-cells, where each cell carries
14 *      its own list of nested handler calls.
15 *   2. Strips the empty leading and trailing cells that result from leading
16 *      and trailing pipes (`| a | b |` → cells `["", " a ", " b ", ""]` →
17 *      `[" a ", " b "]`).
18 *   3. Parses the second row as the GFM delimiter row, deriving per-column
19 *      alignment from `:?-+:?` patterns and the column count from the cell
20 *      count.
21 *   4. Validates that the header row's cell count matches the delimiter's.
22 *      On mismatch (spec example 203), emits the captured text back as a
23 *      single cdata so the Block rewriter wraps it in a paragraph.
24 *   5. Pads body rows that are short (spec 202) and truncates body rows
25 *      that are long (spec 204) to the header's column count.
26 *   6. Trims leading/trailing whitespace from each cell's edge cdata calls
27 *      ("Spaces between pipes and cell content are trimmed").
28 *   7. Emits the canonical DokuWiki table call sequence — `table_open`,
29 *      `tablethead_open`, `tablerow_open`, per-column `tableheader_open`
30 *      with alignment, `tablethead_close`, then (only when there are
31 *      body rows — spec 205) `tabletbody_open`, per-row `tablerow_open`
32 *      with `tablecell_open`, `tabletbody_close`, and finally
33 *      `table_close`. No new handler instructions are introduced;
34 *      `tabletbody_open` / `tabletbody_close` are part of DokuWiki's
35 *      base renderer API but were never emitted before — DW Table omits
36 *      `<tbody>` entirely. Activating them here is what frees the test
37 *      renderer from having to track tbody state.
38 *
39 * Backslash-escaped pipes (`\|`) are not unescaped here — that is
40 * GfmEscape's responsibility and applies project-wide. Until that mode
41 * lands, the literal `\|` survives in cell content. The lexer's cell-
42 * separator lookbehind ensures the escape at least keeps cells from
43 * being split on the protected pipe (spec 200, partially).
44 */
45class GfmTable extends AbstractRewriter
46{
47    /** @inheritdoc */
48    protected function getClosingCall(): string
49    {
50        return 'gfm_table_end';
51    }
52
53    /** @inheritdoc */
54    public function process()
55    {
56        ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos] = $this->groupRows();
57        $rows = array_map($this->stripBoundaryEmpty(...), $rows);
58
59        $alignments = array_map(
60            fn($cell) => $this->parseAlign($this->cellText($cell)),
61            $rows[1]
62        );
63        $cols = count($alignments);
64
65        // Header / delimiter column-count mismatch is the spec-203 fallback.
66        if (count($rows[0]) !== $cols) {
67            $this->emitFallback($rows, $startPos);
68            return $this->callWriter;
69        }
70
71        $headerRow = $this->trimCellEdges($rows[0]);
72        $bodyRows = array_map(
73            fn($row) => $this->trimCellEdges($this->padOrTruncate($row, $cols)),
74            array_slice($rows, 2)
75        );
76
77        $out = $this->buildOutput($headerRow, $bodyRows, $alignments, $cols, $startPos, $endPos);
78        $this->callWriter->writeCalls($out);
79        return $this->callWriter;
80    }
81
82    /**
83     * Walk $this->calls and bucket them into rows-of-cells-of-calls.
84     *
85     * @return array{rows: array<int, array<int, array<int, array>>>, startPos: int, endPos: int}
86     *   `rows[r][c]` is a list of handler calls captured inside row `r`'s
87     *   cell `c`. `startPos` and `endPos` carry the table's opening and
88     *   closing source positions.
89     */
90    protected function groupRows(): array
91    {
92        $rows = [];
93        $rowIdx = -1;
94        $startPos = 0;
95        $endPos = 0;
96
97        foreach ($this->calls as $call) {
98            switch ($call[0]) {
99                case 'gfm_table_start':
100                    $startPos = $call[1][0] ?? $call[2];
101                    break;
102                case 'gfm_table_end':
103                    $endPos = $call[2];
104                    break;
105                case 'gfm_table_row':
106                    $rows[] = [];
107                    $rowIdx++;
108                    break;
109                case 'gfm_table_cell':
110                    $rows[$rowIdx][] = [];
111                    break;
112                default:
113                    if ($rowIdx >= 0 && !empty($rows[$rowIdx])) {
114                        $cellIdx = count($rows[$rowIdx]) - 1;
115                        $rows[$rowIdx][$cellIdx][] = $call;
116                    }
117                    break;
118            }
119        }
120
121        return ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos];
122    }
123
124    /**
125     * Remove leading and trailing empty cell from given row.
126     *
127     * Effects of leading and trailing pipes: `| a | b |` parses into four
128     * cells `["", " a ", " b ", ""]`. A row with no surrounding pipes
129     * (`a | b`) parses into two non-empty cells, which stay untouched.
130     *
131     * @param array $row a row as a list of cells; each cell is a list of
132     *                   handler calls captured between separators
133     * @return array the row with at most one boundary empty cell stripped
134     *               from each end
135     */
136    protected function stripBoundaryEmpty(array $row): array
137    {
138        if ($row && $row[0] === []) array_shift($row);
139        if ($row && end($row) === []) array_pop($row);
140        return $row;
141    }
142
143    /**
144     * Concatenate the original source text of every text-bearing call in a
145     * cell. Used for delimiter parsing and the spec-203 fallback.
146     *
147     * Relies on the project-wide convention that any inline mode which
148     * swallows source text records the matched string at args[0] — true
149     * for `cdata`, `entity`, `unformatted`, `smiley`, `multiplyentity`,
150     * plugin substitutions, etc. Open/close pairs carry empty args and
151     * drop out naturally.
152     *
153     * Motivating case: Entity eats runs of `---` as em-dash entities, so
154     * a naive cdata-only join would lose the delimiter dashes and
155     * parseAlign() would refuse the column.
156     *
157     * Implementation: extract every call's args list, extract index 0
158     * from each, implode.
159     *
160     * @param array $cellCalls handler calls captured inside one cell
161     * @return string the concatenated source text
162     */
163    protected function cellText(array $cellCalls): string
164    {
165        return implode('', array_column(array_column($cellCalls, 1), 0));
166    }
167
168    /**
169     * Decode a single delimiter cell into 'left' / 'center' / 'right' / null.
170     *
171     * Trusts the entry pattern's validation that the cell has the shape
172     * `:?-+:?`; just checks for colons at the edges.
173     *
174     * @param string $cellText the joined source text of one delimiter cell
175     * @return string|null 'left', 'center', 'right', or null when no
176     *                     alignment marker is present
177     */
178    protected function parseAlign(string $cellText): ?string
179    {
180        $trimmed = trim($cellText);
181        $left = str_starts_with($trimmed, ':');
182        $right = str_ends_with($trimmed, ':');
183        return match (true) {
184            $left && $right => 'center',
185            $right => 'right',
186            $left => 'left',
187            default => null,
188        };
189    }
190
191    /**
192     * Return a copy of the row padded with empty cells (spec 202) or
193     * truncated to the header column count (spec 204).
194     *
195     * @param array $row a body row as a list of cells
196     * @param int $cols the target column count derived from the delimiter row
197     * @return array the row with exactly $cols cells
198     */
199    protected function padOrTruncate(array $row, int $cols): array
200    {
201        $count = count($row);
202        if ($count < $cols) {
203            return array_pad($row, $cols, []);
204        }
205        if ($count > $cols) {
206            return array_slice($row, 0, $cols);
207        }
208        return $row;
209    }
210
211    /**
212     * Return a copy of the row with each cell's first cdata ltrimmed,
213     * its last cdata rtrimmed, and any cdata that became empty dropped.
214     * Intermediate cdata are left intact so internal spaces are preserved.
215     *
216     * @param array $row a row as a list of cells
217     * @return array the row with each cell's edge cdata trimmed
218     */
219    protected function trimCellEdges(array $row): array
220    {
221        return array_map($this->trimCell(...), $row);
222    }
223
224    /**
225     * Helper for trimCellEdges: trim edge cdata of a single cell.
226     *
227     * @param array $cell the cell as a list of handler calls
228     * @return array the cell with its first cdata ltrimmed, its last
229     *               cdata rtrimmed, and any cdata that became empty
230     *               dropped
231     */
232    protected function trimCell(array $cell): array
233    {
234        // get all cdata call indexes
235        $cdataIdx = array_keys(array_filter($cell, fn($c) => $c[0] === 'cdata'));
236        if ($cdataIdx) {
237            // if any, trim the first and last one's text
238            $cell[$cdataIdx[0]][1][0] = ltrim($cell[$cdataIdx[0]][1][0]);
239            $cell[end($cdataIdx)][1][0] = rtrim($cell[end($cdataIdx)][1][0]);
240        }
241        // return all cells that are not cdate or are not empty after trimming
242        return array_values(array_filter(
243            $cell,
244            fn($c) => $c[0] !== 'cdata' || $c[1][0] !== ''
245        ));
246    }
247
248    /**
249     * Spec-203 fallback. Reconstruct a `|a|b|`-style line from each row's
250     * cells via cellText() and emit the joined block as a single cdata so
251     * the Block rewriter wraps it in a paragraph. Because cellText() also
252     * walks `entity` / `unformatted` / etc., the source-text delimiter
253     * characters survive even when an inline mode consumed them.
254     *
255     * @param array $rows the captured rows-of-cells-of-calls structure
256     * @param int $pos the source position to attach to the emitted cdata
257     */
258    protected function emitFallback(array $rows, int $pos): void
259    {
260        $lines = [];
261        foreach ($rows as $row) {
262            $cellTexts = [];
263            foreach ($row as $cell) {
264                $cellTexts[] = $this->cellText($cell);
265            }
266            $lines[] = '|' . implode('|', $cellTexts) . '|';
267        }
268        $text = implode("\n", $lines);
269        if ($text === '') return;
270        $this->callWriter->writeCall(['cdata', [$text], $pos]);
271    }
272
273    /**
274     * Assemble the canonical DokuWiki table-instruction sequence.
275     *
276     * `tabletbody_open` / `tabletbody_close` are emitted only when there
277     * are body rows. Suppressing them for empty-body tables (spec 205)
278     * matches the spec's "<thead> only, no <tbody>" expectation without
279     * any state-tracking on the renderer side.
280     *
281     * @param array $headerRow trimmed header row, one cell per column
282     * @param array $bodyRows trimmed body rows, each padded or truncated
283     *                        to $cols
284     * @param array $alignments per-column alignment from the delimiter
285     *                          row; each entry is 'left' / 'center' /
286     *                          'right' / null
287     * @param int $cols column count derived from the delimiter row
288     * @param int $startPos source position of the table's start
289     * @param int $endPos source position of the table's end
290     * @return array the canonical DokuWiki table call sequence ready for
291     *               the outer call writer
292     */
293    protected function buildOutput(
294        array $headerRow,
295        array $bodyRows,
296        array $alignments,
297        int $cols,
298        int $startPos,
299        int $endPos
300    ): array {
301        $out = [];
302        $out[] = ['table_open', [$cols, 1 + count($bodyRows), $startPos], $startPos];
303        $out[] = ['tablethead_open', [], $startPos];
304        $out[] = ['tablerow_open', [], $startPos];
305        foreach ($headerRow as $i => $cell) {
306            $out[] = ['tableheader_open', [1, $alignments[$i], 1], $startPos];
307            foreach ($cell as $c) $out[] = $c;
308            $out[] = ['tableheader_close', [], $startPos];
309        }
310        $out[] = ['tablerow_close', [], $startPos];
311        $out[] = ['tablethead_close', [], $startPos];
312
313        if ($bodyRows) {
314            $out[] = ['tabletbody_open', [], $startPos];
315            foreach ($bodyRows as $row) {
316                $out[] = ['tablerow_open', [], $startPos];
317                foreach ($row as $i => $cell) {
318                    $out[] = ['tablecell_open', [1, $alignments[$i], 1], $startPos];
319                    foreach ($cell as $c) $out[] = $c;
320                    $out[] = ['tablecell_close', [], $startPos];
321                }
322                $out[] = ['tablerow_close', [], $startPos];
323            }
324            $out[] = ['tabletbody_close', [], $startPos];
325        }
326        $out[] = ['table_close', [$endPos], $endPos];
327        return $out;
328    }
329}
330