<?php

namespace dokuwiki\Parsing\Handler;

/**
 * CallWriter rewriter for GFM tables.
 *
 * GfmTable's lexer state emits a flat token stream of marker calls
 * (`gfm_table_start`, `gfm_table_row`, `gfm_table_cell`, `gfm_table_end`)
 * interleaved with whatever inline modes (emphasis, code spans, links, …)
 * matched inside the cells. This rewriter:
 *
 *   1. Groups the flat stream into rows-of-cells, where each cell carries
 *      its own list of nested handler calls.
 *   2. Strips the empty leading and trailing cells that result from leading
 *      and trailing pipes (`| a | b |` → cells `["", " a ", " b ", ""]` →
 *      `[" a ", " b "]`).
 *   3. Parses the second row as the GFM delimiter row, deriving per-column
 *      alignment from `:-+:?` patterns and the column count from the cell
 *      count.
 *   4. Validates that the header row's cell count matches the delimiter's.
 *      On mismatch (spec example 203), or when the delimiter row is missing
 *      altogether, emits the captured text back as a single cdata so the
 *      Block rewriter wraps it in a paragraph.
 *   5. Pads body rows that are short (spec 202) and truncates body rows
 *      that are long (spec 204) to the header's column count.
 *   6. Trims leading/trailing whitespace from each cell's edge cdata calls
 *      ("Spaces between pipes and cell content are trimmed").
 *   7. Emits the canonical DokuWiki table call sequence — `table_open`,
 *      `tablethead_open`, `tablerow_open`, per-column `tableheader_open`
 *      with alignment, `tablethead_close`, then (only when there are
 *      body rows — spec 205) `tabletbody_open`, per-row `tablerow_open`
 *      with `tablecell_open`, `tabletbody_close`, and finally
 *      `table_close`. No new handler instructions are introduced;
 *      `tabletbody_open` / `tabletbody_close` are part of DokuWiki's
 *      base renderer API but were never emitted before — DW Table omits
 *      `<tbody>` entirely. Activating them here is what frees the test
 *      renderer from having to track tbody state.
 *
 * Backslash-escaped pipes outside protected regions are consumed by
 * GfmEscape before the cell content reaches this rewriter. Inside
 * code spans (and any other whole-span PROTECTED capture) the `\|`
 * survives as literal text — and the GFM tables extension demands
 * that `\|` unescape to `|` even there, overriding §6.1's
 * "escapes don't work in code spans" rule. unescapePipes() applies
 * that rewrite per cell to every text-bearing call.
 */
class GfmTable extends AbstractRewriter
{
    /** @inheritdoc */
    protected function getClosingCall(): string
    {
        return 'gfm_table_end';
    }

    /**
     * Turn the captured marker stream into table instructions, or into a
     * plain-text fallback when the captured rows do not form a valid table.
     *
     * @inheritdoc
     */
    public function process()
    {
        ['rows' => $rawRows, 'startPos' => $startPos, 'endPos' => $endPos] = $this->groupRows();
        $rows = array_map($this->stripBoundaryEmpty(...), $rawRows);

        // Without a delimiter row there is nothing to derive columns from;
        // hand the text back instead of reading an undefined $rows[1].
        if (count($rows) < 2) {
            $this->emitFallback($rawRows, $startPos, true);
            return $this->callWriter;
        }

        $alignments = array_map(
            fn(array $cell): ?string => $this->parseAlign($this->cellText($cell)),
            $rows[1]
        );
        $cols = count($alignments);

        // Header / delimiter column-count mismatch is the spec-203 fallback.
        // The unstripped rows are passed so the original pipes are reproduced
        // exactly (a pipeless `a | b` row must not gain outer pipes).
        if (count($rows[0]) !== $cols) {
            $this->emitFallback($rawRows, $startPos, true);
            return $this->callWriter;
        }

        $headerRow = $this->unescapePipes($this->trimCellEdges($rows[0]));
        $bodyRows = array_map(
            fn(array $row): array => $this->unescapePipes(
                $this->trimCellEdges($this->padOrTruncate($row, $cols))
            ),
            array_slice($rows, 2)
        );

        $out = $this->buildOutput($headerRow, $bodyRows, $alignments, $cols, $startPos, $endPos);
        $this->callWriter->writeCalls($out);
        return $this->callWriter;
    }

    /**
     * Walk $this->calls and bucket them into rows-of-cells-of-calls.
     *
     * Calls that arrive before the first cell marker of a row are dropped,
     * and a cell marker that arrives before any row marker is ignored so it
     * cannot create a row under a negative index.
     *
     * @return array{rows: array<int, array<int, array<int, array>>>, startPos: int, endPos: int}
     *         `rows[r][c]` is a list of handler calls captured inside row `r`'s
     *         cell `c`. `startPos` and `endPos` carry the table's opening and
     *         closing source positions.
     */
    protected function groupRows(): array
    {
        $rows = [];
        $rowIdx = -1;
        $startPos = 0;
        $endPos = 0;

        foreach ($this->calls as $call) {
            switch ($call[0]) {
                case 'gfm_table_start':
                    // prefer the marker's first argument, else the call's own position slot
                    $startPos = $call[1][0] ?? $call[2];
                    break;
                case 'gfm_table_end':
                    $endPos = $call[2];
                    break;
                case 'gfm_table_row':
                    $rows[] = [];
                    $rowIdx++;
                    break;
                case 'gfm_table_cell':
                    if ($rowIdx >= 0) {
                        $rows[$rowIdx][] = [];
                    }
                    break;
                default:
                    // content belongs to the most recently opened cell of the current row
                    if ($rowIdx >= 0 && !empty($rows[$rowIdx])) {
                        $cellIdx = count($rows[$rowIdx]) - 1;
                        $rows[$rowIdx][$cellIdx][] = $call;
                    }
                    break;
            }
        }

        return ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos];
    }

    /**
     * Remove leading and trailing empty cell from given row.
     *
     * Effects of leading and trailing pipes: `| a | b |` parses into four
     * cells `["", " a ", " b ", ""]`. A row with no surrounding pipes
     * (`a | b`) parses into two non-empty cells, which stay untouched.
     *
     * @param array $row a row as a list of cells; each cell is a list of
     *                   handler calls captured between separators
     * @return array the row with at most one boundary empty cell stripped
     *               from each end
     */
    protected function stripBoundaryEmpty(array $row): array
    {
        if ($row !== [] && $row[0] === []) {
            array_shift($row);
        }
        if ($row !== [] && end($row) === []) {
            array_pop($row);
        }
        return $row;
    }

    /**
     * Concatenate the original source text of every text-bearing call in a
     * cell. Used for delimiter parsing and the spec-203 fallback.
     *
     * Relies on the project-wide convention that any inline mode which
     * swallows source text records the matched string at args[0] — true
     * for `cdata`, `entity`, `unformatted`, `smiley`, `multiplyentity`,
     * plugin substitutions, etc. Open/close pairs carry empty args and
     * drop out naturally.
     *
     * Motivating case: Entity eats runs of `---` as em-dash entities, so
     * a naive cdata-only join would lose the delimiter dashes and
     * parseAlign() would refuse the column.
     *
     * Implementation: extract every call's args list, extract index 0
     * from each, keep the string values, implode. Non-string first
     * arguments are skipped because they cannot be matched source text
     * (and an array would otherwise be imploded as "Array").
     *
     * @param array $cellCalls handler calls captured inside one cell
     * @return string the concatenated source text
     */
    protected function cellText(array $cellCalls): string
    {
        $firstArgs = array_column(array_column($cellCalls, 1), 0);
        return implode('', array_filter($firstArgs, is_string(...)));
    }

    /**
     * Decode a single delimiter cell into 'left' / 'center' / 'right' / null.
     *
     * Trusts the entry pattern's validation that the cell has the shape
     * `:?-+:?`; just checks for colons at the edges.
     *
     * @param string $cellText the joined source text of one delimiter cell
     * @return string|null 'left', 'center', 'right', or null when no
     *                     alignment marker is present
     */
    protected function parseAlign(string $cellText): ?string
    {
        $trimmed = trim($cellText);
        $left = str_starts_with($trimmed, ':');
        $right = str_ends_with($trimmed, ':');
        return match (true) {
            $left && $right => 'center',
            $right => 'right',
            $left => 'left',
            default => null,
        };
    }

    /**
     * Return a copy of the row padded with empty cells (spec 202) or
     * truncated to the header column count (spec 204).
     *
     * @param array $row a body row as a list of cells
     * @param int $cols the target column count derived from the delimiter row
     * @return array the row with exactly $cols cells
     */
    protected function padOrTruncate(array $row, int $cols): array
    {
        $count = count($row);
        if ($count < $cols) {
            return array_pad($row, $cols, []);
        }
        if ($count > $cols) {
            return array_slice($row, 0, $cols);
        }
        return $row;
    }

    /**
     * Return a copy of the row with the whitespace next to each cell's
     * separators removed; see trimCell() for the per-cell rule.
     *
     * @param array $row a row as a list of cells
     * @return array the row with each cell's edge cdata trimmed
     */
    protected function trimCellEdges(array $row): array
    {
        return array_map($this->trimCell(...), $row);
    }

    /**
     * Helper for trimCellEdges: trim edge cdata of a single cell.
     *
     * Only a cdata that is the very first call of the cell is ltrimmed, and
     * only a cdata that is the very last call is rtrimmed. A cdata that
     * follows or precedes another inline call is interior text: in
     * `|[[x]] y|` the space before `y` separates it from the link and must
     * survive, so it is left alone.
     *
     * @param array $cell the cell as a list of handler calls
     * @return array the cell with its leading cdata ltrimmed, its trailing
     *               cdata rtrimmed, and any cdata that is empty dropped
     */
    protected function trimCell(array $cell): array
    {
        if ($cell === []) {
            return [];
        }

        $first = array_key_first($cell);
        $last = array_key_last($cell);

        if ($cell[$first][0] === 'cdata') {
            $cell[$first][1][0] = ltrim($cell[$first][1][0]);
        }
        // read after the ltrim above so a single-cdata cell is trimmed on both sides
        if ($cell[$last][0] === 'cdata') {
            $cell[$last][1][0] = rtrim($cell[$last][1][0]);
        }

        // keep every call that is not a cdata or is not empty after trimming
        return array_values(array_filter(
            $cell,
            static fn(array $call): bool => $call[0] !== 'cdata' || $call[1][0] !== ''
        ));
    }

    /**
     * Apply the GFM tables-extension rule that `\|` always unescapes to
     * `|` inside table cells — including the bodies of code spans and
     * other whole-span PROTECTED captures, where standard §6.1 escape
     * rules don't fire. Walks every text-bearing call (cdata,
     * unformatted, entity, plugin substitutions, …) and str_replace's
     * the literal two-char sequence on its first arg. Other escapes
     * inside code spans are left alone — only `\|` gets the special
     * table treatment.
     *
     * In normal cell text, GfmEscape has already consumed `\|` upstream,
     * so this pass is a no-op there; its job is to catch the codespan
     * case that bypasses the lexer.
     *
     * @param array $row a row as a list of cells
     * @return array the row with `\|` rewritten to `|` in every cell
     */
    protected function unescapePipes(array $row): array
    {
        // write back by key instead of iterating by reference, so no
        // reference outlives the loops
        foreach ($row as $cellIdx => $cell) {
            foreach ($cell as $callIdx => $call) {
                if (isset($call[1][0]) && is_string($call[1][0])) {
                    $row[$cellIdx][$callIdx][1][0] = str_replace('\\|', '|', $call[1][0]);
                }
            }
        }
        return $row;
    }

    /**
     * Spec-203 fallback. Reconstruct the source lines from each row's
     * cells via cellText() and emit the joined block as a single cdata so
     * the Block rewriter wraps it in a paragraph. Because cellText() also
     * walks `entity` / `unformatted` / etc., the source-text delimiter
     * characters survive even when an inline mode consumed them.
     *
     * Nothing is emitted when the reconstructed text is empty.
     *
     * @param array $rows the captured rows-of-cells-of-calls structure
     * @param int $pos the source position to attach to the emitted cdata
     * @param bool $rawCells false (default): $rows had their boundary cells
     *                       stripped, so every line is wrapped as `|a|b|`.
     *                       true: $rows still hold the empty boundary cells
     *                       produced by leading/trailing pipes, so joining
     *                       the cells with `|` already reproduces exactly
     *                       the pipes that were written and none are added.
     */
    protected function emitFallback(array $rows, int $pos, bool $rawCells = false): void
    {
        $lines = [];
        foreach ($rows as $row) {
            $line = implode('|', array_map($this->cellText(...), $row));
            $lines[] = $rawCells ? $line : '|' . $line . '|';
        }

        $text = implode("\n", $lines);
        if ($text === '') {
            return;
        }
        $this->callWriter->writeCall(['cdata', [$text], $pos]);
    }

    /**
     * Assemble the canonical DokuWiki table-instruction sequence.
     *
     * `tabletbody_open` / `tabletbody_close` are emitted only when there
     * are body rows. Suppressing them for empty-body tables (spec 205)
     * matches the spec's "<thead> only, no <tbody>" expectation without
     * any state-tracking on the renderer side.
     *
     * @param array $headerRow trimmed header row, one cell per column
     * @param array $bodyRows trimmed body rows, each padded or truncated
     *                        to $cols
     * @param array $alignments per-column alignment from the delimiter
     *                          row; each entry is 'left' / 'center' /
     *                          'right' / null
     * @param int $cols column count derived from the delimiter row
     * @param int $startPos source position of the table's start
     * @param int $endPos source position of the table's end
     * @return array the canonical DokuWiki table call sequence ready for
     *               the outer call writer
     */
    protected function buildOutput(
        array $headerRow,
        array $bodyRows,
        array $alignments,
        int $cols,
        int $startPos,
        int $endPos
    ): array {
        // collect list-shaped chunks and flatten once at the end
        $chunks = [
            [
                // the row count includes the header row
                ['table_open', [$cols, 1 + count($bodyRows), $startPos], $startPos],
                ['tablethead_open', [], $startPos],
            ],
            $this->buildRow($headerRow, 'tableheader', $alignments, $startPos),
            [['tablethead_close', [], $startPos]],
        ];

        if ($bodyRows) {
            $chunks[] = [['tabletbody_open', [], $startPos]];
            foreach ($bodyRows as $row) {
                $chunks[] = $this->buildRow($row, 'tablecell', $alignments, $startPos);
            }
            $chunks[] = [['tabletbody_close', [], $startPos]];
        }

        $chunks[] = [['table_close', [$endPos], $endPos]];

        return array_merge(...$chunks);
    }

    /**
     * Helper for buildOutput: emit one `tablerow_open` … `tablerow_close`
     * run, wrapping each cell's calls in `<cellType>_open` /
     * `<cellType>_close`.
     *
     * @param array $row the row as a list of cells
     * @param string $cellType instruction prefix, 'tableheader' or 'tablecell'
     * @param array $alignments per-column alignment, indexed like $row
     * @param int $pos source position attached to every emitted call
     * @return array the list of calls for this row
     */
    protected function buildRow(array $row, string $cellType, array $alignments, int $pos): array
    {
        $calls = [['tablerow_open', [], $pos]];
        foreach ($row as $i => $cell) {
            // args: colspan 1, the column's alignment, rowspan 1
            $calls[] = [$cellType . '_open', [1, $alignments[$i], 1], $pos];
            foreach ($cell as $call) {
                $calls[] = $call;
            }
            $calls[] = [$cellType . '_close', [], $pos];
        }
        $calls[] = ['tablerow_close', [], $pos];
        return $calls;
    }
}