<?php

namespace dokuwiki\Parsing\Handler;

/**
 * CallWriter rewriter for GFM tables.
 *
 * GfmTable's lexer state emits a flat token stream of marker calls
 * (`gfm_table_start`, `gfm_table_row`, `gfm_table_cell`, `gfm_table_end`)
 * interleaved with whatever inline modes (emphasis, code spans, links, …)
 * matched inside the cells. This rewriter:
 *
 *   1. Groups the flat stream into rows-of-cells, where each cell carries
 *      its own list of nested handler calls.
 *   2. Strips the empty leading and trailing cells that result from leading
 *      and trailing pipes (`| a | b |` → cells `["", " a ", " b ", ""]` →
 *      `[" a ", " b "]`).
 *   3. Parses the second row as the GFM delimiter row, deriving per-column
 *      alignment from `:?-+:?` patterns and the column count from the cell
 *      count.
 *   4. Validates that the header row's cell count matches the delimiter's.
 *      On mismatch (spec example 203), emits the captured text back as a
 *      single cdata so the Block rewriter wraps it in a paragraph. The same
 *      fallback is used when the stream holds fewer than two rows.
 *   5. Pads body rows that are short (spec 202) and truncates body rows
 *      that are long (spec 204) to the header's column count.
 *   6. Trims leading/trailing whitespace from each cell's edge cdata calls
 *      ("Spaces between pipes and cell content are trimmed").
 *   7. Emits the canonical DokuWiki table call sequence — `table_open`,
 *      `tablethead_open`, `tablerow_open`, per-column `tableheader_open`
 *      with alignment, `tablethead_close`, then (only when there are
 *      body rows — spec 205) `tabletbody_open`, per-row `tablerow_open`
 *      with `tablecell_open`, `tabletbody_close`, and finally
 *      `table_close`. No new handler instructions are introduced;
 *      `tabletbody_open` / `tabletbody_close` are part of DokuWiki's
 *      base renderer API but were never emitted before — DW Table omits
 *      `<tbody>` entirely. Activating them here is what frees the test
 *      renderer from having to track tbody state.
 *
 * Backslash-escaped pipes (`\|`) are not unescaped here — that is
 * GfmEscape's responsibility and applies project-wide. Until that mode
 * lands, the literal `\|` survives in cell content. The lexer's cell-
 * separator lookbehind ensures the escape at least keeps cells from
 * being split on the protected pipe (spec 200, partially).
 */
class GfmTable extends AbstractRewriter
{
    /** @inheritdoc */
    protected function getClosingCall(): string
    {
        return 'gfm_table_end';
    }

    /**
     * Rewrite the captured marker stream into DokuWiki table calls.
     *
     * Groups the captured calls into rows, derives alignments and the
     * column count from the second (delimiter) row and writes either the
     * finished table or, when the input does not form a valid table, a
     * plain-text fallback to the outer call writer.
     *
     * @inheritdoc
     */
    public function process()
    {
        ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos] = $this->groupRows();
        $rows = array_map($this->stripBoundaryEmpty(...), $rows);

        // Without both a header and a delimiter row there is nothing to
        // derive columns from; hand the text back instead of reading an
        // undefined $rows[1].
        if (count($rows) < 2) {
            $this->emitFallback($rows, $startPos);
            return $this->callWriter;
        }

        $alignments = array_map(
            fn($cell) => $this->parseAlign($this->cellText($cell)),
            $rows[1]
        );
        $cols = count($alignments);

        // Header / delimiter column-count mismatch is the spec-203 fallback.
        if (count($rows[0]) !== $cols) {
            $this->emitFallback($rows, $startPos);
            return $this->callWriter;
        }

        $headerRow = $this->trimCellEdges($rows[0]);
        $bodyRows = array_map(
            fn($row) => $this->trimCellEdges($this->padOrTruncate($row, $cols)),
            array_slice($rows, 2)
        );

        $out = $this->buildOutput($headerRow, $bodyRows, $alignments, $cols, $startPos, $endPos);
        $this->callWriter->writeCalls($out);
        return $this->callWriter;
    }

    /**
     * Walk $this->calls and bucket them into rows-of-cells-of-calls.
     *
     * Calls seen before the first cell of a row are dropped, and a cell
     * marker seen before any row marker is ignored, so the returned rows
     * are always a zero-based list.
     *
     * @return array{rows: array<int, array<int, array<int, array>>>, startPos: int, endPos: int}
     *         `rows[r][c]` is a list of handler calls captured inside row `r`'s
     *         cell `c`. `startPos` and `endPos` carry the table's opening and
     *         closing source positions.
     */
    protected function groupRows(): array
    {
        $rows = [];
        $rowIdx = -1;
        $startPos = 0;
        $endPos = 0;

        foreach ($this->calls as $call) {
            switch ($call[0]) {
                case 'gfm_table_start':
                    // prefer the first argument when present, else the call's own position
                    $startPos = $call[1][0] ?? $call[2];
                    break;
                case 'gfm_table_end':
                    $endPos = $call[2];
                    break;
                case 'gfm_table_row':
                    $rows[] = [];
                    $rowIdx++;
                    break;
                case 'gfm_table_cell':
                    // A cell marker without a preceding row marker has no row
                    // to attach to; writing to $rows[-1] would corrupt the
                    // row list, so skip it.
                    if ($rowIdx >= 0) {
                        $rows[$rowIdx][] = [];
                    }
                    break;
                default:
                    // content call: append to the most recently opened cell
                    if ($rowIdx >= 0 && !empty($rows[$rowIdx])) {
                        $cellIdx = count($rows[$rowIdx]) - 1;
                        $rows[$rowIdx][$cellIdx][] = $call;
                    }
                    break;
            }
        }

        return ['rows' => $rows, 'startPos' => $startPos, 'endPos' => $endPos];
    }

    /**
     * Remove the leading and the trailing empty cell from the given row.
     *
     * Effects of leading and trailing pipes: `| a | b |` parses into four
     * cells `["", " a ", " b ", ""]`. A row with no surrounding pipes
     * (`a | b`) parses into two non-empty cells, which stay untouched.
     *
     * @param array $row a row as a list of cells; each cell is a list of
     *                   handler calls captured between separators
     * @return array the row with at most one boundary empty cell stripped
     *               from each end
     */
    protected function stripBoundaryEmpty(array $row): array
    {
        if ($row && $row[0] === []) array_shift($row);
        if ($row && end($row) === []) array_pop($row);
        return $row;
    }

    /**
     * Concatenate the original source text of every text-bearing call in a
     * cell. Used for delimiter parsing and the spec-203 fallback.
     *
     * For most calls the text is taken from args[0] (e.g. `cdata`,
     * `entity`, `unformatted`, `smiley`, `multiplyentity`). `plugin` calls
     * are laid out as `[pluginname, data, state, match]`, so for those the
     * matched source is read from args[3] instead. Calls without arguments
     * (open/close pairs) and calls whose text slot holds a non-scalar value
     * (e.g. a nested call list) contribute nothing.
     *
     * Motivating case: Entity eats runs of `---` as em-dash entities, so
     * a naive cdata-only join would lose the delimiter dashes and
     * parseAlign() would refuse the column.
     *
     * @param array $cellCalls handler calls captured inside one cell
     * @return string the concatenated source text
     */
    protected function cellText(array $cellCalls): string
    {
        $text = '';
        foreach ($cellCalls as $call) {
            $args = $call[1] ?? null;
            if (!is_array($args)) continue;

            $piece = $call[0] === 'plugin' ? ($args[3] ?? null) : ($args[0] ?? null);

            // arrays/objects cannot be rendered as source text; skipping
            // them avoids "Array to string conversion" noise
            if (is_scalar($piece)) {
                $text .= $piece;
            }
        }
        return $text;
    }

    /**
     * Decode a single delimiter cell into 'left' / 'center' / 'right' / null.
     *
     * Trusts the entry pattern's validation that the cell has the shape
     * `:?-+:?`; just checks for colons at the edges.
     *
     * @param string $cellText the joined source text of one delimiter cell
     * @return string|null 'left', 'center', 'right', or null when no
     *                     alignment marker is present
     */
    protected function parseAlign(string $cellText): ?string
    {
        $trimmed = trim($cellText);
        $left = str_starts_with($trimmed, ':');
        $right = str_ends_with($trimmed, ':');
        return match (true) {
            $left && $right => 'center',
            $right => 'right',
            $left => 'left',
            default => null,
        };
    }

    /**
     * Return a copy of the row padded with empty cells (spec 202) or
     * truncated to the header column count (spec 204).
     *
     * @param array $row a body row as a list of cells
     * @param int $cols the target column count derived from the delimiter row
     * @return array the row with exactly $cols cells
     */
    protected function padOrTruncate(array $row, int $cols): array
    {
        $count = count($row);
        if ($count < $cols) {
            return array_pad($row, $cols, []);
        }
        if ($count > $cols) {
            return array_slice($row, 0, $cols);
        }
        return $row;
    }

    /**
     * Return a copy of the row with each cell's first cdata ltrimmed,
     * its last cdata rtrimmed, and any empty cdata dropped.
     * Intermediate cdata are left intact so internal spaces are preserved.
     *
     * @param array $row a row as a list of cells
     * @return array the row with each cell's edge cdata trimmed
     */
    protected function trimCellEdges(array $row): array
    {
        return array_map($this->trimCell(...), $row);
    }

    /**
     * Helper for trimCellEdges: trim edge cdata of a single cell.
     *
     * @param array $cell the cell as a list of handler calls
     * @return array the cell, re-indexed, with its first cdata ltrimmed,
     *               its last cdata rtrimmed, and every cdata whose text is
     *               empty afterwards dropped
     */
    protected function trimCell(array $cell): array
    {
        // indexes of all cdata calls in the cell, in document order
        $cdataIdx = array_keys(array_filter($cell, fn($c) => $c[0] === 'cdata'));
        if ($cdataIdx) {
            // first and last may be the same call, which then gets trimmed on both sides
            $first = $cdataIdx[0];
            $last = end($cdataIdx);
            $cell[$first][1][0] = ltrim($cell[$first][1][0]);
            $cell[$last][1][0] = rtrim($cell[$last][1][0]);
        }
        // keep every call that is not a cdata, and every cdata that still has text
        return array_values(array_filter(
            $cell,
            fn($c) => $c[0] !== 'cdata' || $c[1][0] !== ''
        ));
    }

    /**
     * Plain-text fallback (spec 203). Reconstruct a `|a|b|`-style line from
     * each row's cells via cellText() and emit the joined block as a single
     * cdata so the Block rewriter wraps it in a paragraph. Because cellText()
     * also walks `entity` / `unformatted` / etc., the source-text delimiter
     * characters survive even when an inline mode consumed them.
     *
     * Nothing is written when there are no rows at all.
     *
     * @param array $rows the captured rows-of-cells-of-calls structure
     * @param int $pos the source position to attach to the emitted cdata
     */
    protected function emitFallback(array $rows, int $pos): void
    {
        $lines = [];
        foreach ($rows as $row) {
            $cellTexts = array_map($this->cellText(...), $row);
            $lines[] = '|' . implode('|', $cellTexts) . '|';
        }
        $text = implode("\n", $lines);
        if ($text === '') return;
        $this->callWriter->writeCall(['cdata', [$text], $pos]);
    }

    /**
     * Assemble the canonical DokuWiki table-instruction sequence.
     *
     * `tabletbody_open` / `tabletbody_close` are emitted only when there
     * are body rows. Suppressing them for empty-body tables (spec 205)
     * matches the spec's "<thead> only, no <tbody>" expectation without
     * any state-tracking on the renderer side.
     *
     * All structural calls carry $startPos as their position; only
     * `table_close` carries $endPos.
     *
     * @param array $headerRow trimmed header row, one cell per column
     * @param array $bodyRows trimmed body rows, each padded or truncated
     *                        to $cols
     * @param array $alignments per-column alignment from the delimiter
     *                          row; each entry is 'left' / 'center' /
     *                          'right' / null
     * @param int $cols column count derived from the delimiter row
     * @param int $startPos source position of the table's start
     * @param int $endPos source position of the table's end
     * @return array the canonical DokuWiki table call sequence ready for
     *               the outer call writer
     */
    protected function buildOutput(
        array $headerRow,
        array $bodyRows,
        array $alignments,
        int $cols,
        int $startPos,
        int $endPos
    ): array {
        $out = [];
        // row count is the header row plus all body rows
        $out[] = ['table_open', [$cols, 1 + count($bodyRows), $startPos], $startPos];
        $out[] = ['tablethead_open', [], $startPos];
        $out[] = ['tablerow_open', [], $startPos];
        foreach ($headerRow as $i => $cell) {
            $out[] = ['tableheader_open', [1, $alignments[$i], 1], $startPos];
            foreach ($cell as $c) $out[] = $c;
            $out[] = ['tableheader_close', [], $startPos];
        }
        $out[] = ['tablerow_close', [], $startPos];
        $out[] = ['tablethead_close', [], $startPos];

        if ($bodyRows) {
            $out[] = ['tabletbody_open', [], $startPos];
            foreach ($bodyRows as $row) {
                $out[] = ['tablerow_open', [], $startPos];
                foreach ($row as $i => $cell) {
                    $out[] = ['tablecell_open', [1, $alignments[$i], 1], $startPos];
                    foreach ($cell as $c) $out[] = $c;
                    $out[] = ['tablecell_close', [], $startPos];
                }
                $out[] = ['tablerow_close', [], $startPos];
            }
            $out[] = ['tabletbody_close', [], $startPos];
        }
        $out[] = ['table_close', [$endPos], $endPos];
        return $out;
    }
}