<?php

// Source: /dokuwiki/inc/Parsing/ParserMode/GfmTable.php (revision d331a8396503a69ec91cd77124b1b8983c251c54)

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Handler\GfmTable as GfmTableRewriter;
use dokuwiki\Parsing\ModeRegistry;

/**
 * GFM table block.
 *
 * Architecturally mirrors DokuWiki's native Table mode: an entry/exit
 * lexer state with inline modes nested via `allowedModes`, plus a small
 * post-processing rewriter (Handler\GfmTable) that turns the flat token
 * stream into the canonical DokuWiki table call sequence.
 *
 * Cells are inline-only per spec ("Block-level elements cannot be inserted
 * in a table"). Allowed nested categories therefore mirror DW Table:
 * FORMATTING, SUBSTITUTION, PROTECTED, DISABLED.
 *
 * Entry-pattern strategy: a single zero-width lookahead asserts the table
 * shape (header line containing a pipe, followed by a delimiter row whose
 * cells are exactly `:?-+:?`). Only the leading newline is consumed; the
 * lookahead validates the rest. Non-tables — paragraphs that happen to
 * contain pipes — never enter the mode.
 *
 * The internal patterns recognise:
 *   - `\|` as a cell separator, with a `(?<!\\)` lookbehind so a backslash-
 *     prefixed pipe is left as raw input — the cell-splitting concern. The
 *     unescape (turning `\|` into a literal `|`) is handled downstream:
 *     GfmEscape consumes `\|` in normal cell text, and Handler\GfmTable's
 *     unescapePipes() applies the tables-extension rewrite inside code
 *     spans, where standard §6.1 escapes don't fire.
 *   - `\n` followed by a non-newline, non-`>` character as a row separator;
 *   - any other `\n` exits the mode (blank line, blockquote start, EOF).
 *
 * Sort 55 — just below DW Table's 60 — so that in `dw+md` and `md+dw` (where
 * both modes load) the GFM lookahead-validated entry tries first; if it
 * does not see a valid delimiter row, DW Table at sort 60 takes over for
 * `\n|` rows.
 */
class GfmTable extends AbstractMode
{
    /**
     * GFM table cells parse only inline content.
     *
     * Restricts the nested modes to the four inline categories listed in
     * the class documentation.
     */
    public function __construct()
    {
        $this->allowedModes = ModeRegistry::getInstance()->getModesForCategories([
            ModeRegistry::CATEGORY_FORMATTING,
            ModeRegistry::CATEGORY_SUBSTITUTION,
            ModeRegistry::CATEGORY_PROTECTED,
            ModeRegistry::CATEGORY_DISABLED,
        ]);
    }

    /**
     * Sort order of this mode; see the class documentation for why it is
     * lower than the native table mode's.
     *
     * @inheritdoc
     */
    public function getSort()
    {
        return 55;
    }

    /**
     * Registers `gfm_table` as a block EOL mode with the mode registry.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_table');
    }

    /**
     * Entry pattern with lookahead-validated delimiter row.
     *
     * Consumes only `\n`; the zero-width lookahead asserts:
     *   - a header line containing at least one `|`, and
     *   - a delimiter row of `:?-+:?` cells separated by `|`.
     *
     * Without that validation, any paragraph containing a pipe would
     * trigger the table mode. With it, non-tables flow through as plain
     * paragraphs.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // Delimiter row: optional leading pipe, one or more `:?-+:?` cells
        // separated by pipes, optional trailing pipe; spaces/tabs are
        // tolerated around every element.
        $delim =
            '[ \t]*\|?[ \t]*:?-+:?' .
            '(?:[ \t]*\|[ \t]*:?-+:?)*' .
            '[ \t]*\|?[ \t]*';
        $entry =
            '\n(?=' .
                '[^\n]*\|[^\n]*' .  // header line containing a pipe
                '\n' . $delim .
                '(?:\n|$)' .        // delimiter row must end the line or the input
            ')';
        $this->Lexer->addEntryPattern($entry, $mode, 'gfm_table');
    }

    /**
     * Adds the in-table patterns: cell separator, row separator and exit.
     *
     * @inheritdoc
     */
    public function postConnect()
    {
        // Cell separator. The `(?<!\\)` lookbehind keeps `\|` from being
        // treated as a separator so backslash-escaped pipes don't split
        // cells. The unescape — turning `\|` into a literal `|` in cell
        // content — is handled downstream: GfmEscape consumes `\|` in
        // normal text, and Handler\GfmTable::unescapePipes() applies the
        // tables-extension rewrite inside code spans. We just need the
        // cells to come out the right shape. Edge: `\\|` (escaped
        // backslash, then a real separator pipe) is technically wrong
        // here — the lookbehind sees the second `\` and refuses to split
        // — but GfmEscape consumes `\\` first, leaving a clean `|` at
        // separator position.
        $this->Lexer->addPattern('(?<!\\\\)\|', 'gfm_table');
        // Row separator: a newline followed by a non-newline, non-`>` char.
        // Excluding `>` lets a blockquote terminate the table (spec 201);
        // requiring a non-newline excludes blank lines and end-of-input.
        $this->Lexer->addPattern('\n(?=[^\n>])', 'gfm_table');
        // Any other newline (blank line, blockquote start, EOF) exits.
        $this->Lexer->addExitPattern('\n', 'gfm_table');
    }

    /**
     * Translates lexer events for this mode into flat `gfm_table_*` and
     * `cdata` calls, which the GfmTableRewriter collects between ENTER
     * and EXIT.
     *
     * - ENTER: installs the rewriter in front of the current call writer
     *   and opens the table, its first row and its first cell.
     * - MATCHED: a match containing a newline starts a new row (and its
     *   first cell); a bare `|` starts the next cell in the current row.
     * - UNMATCHED: passes the text through as `cdata`.
     * - EXIT: closes the table, lets the rewriter process the collected
     *   calls and reinstalls the call writer returned by process().
     *
     * @param string $match the text matched by the lexer
     * @param int $state one of the DOKU_LEXER_* state constants
     * @param int $pos position of the match in the input (presumably a byte offset — confirm against the lexer)
     * @param Handler $handler the handler receiving the calls
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        switch ($state) {
            case DOKU_LEXER_ENTER:
                $handler->setCallWriter(new GfmTableRewriter($handler->getCallWriter()));
                // table_start carries the body position (skip the consumed `\n`).
                $handler->addCall('gfm_table_start', [$pos + 1], $pos);
                $handler->addCall('gfm_table_row', [], $pos);
                $handler->addCall('gfm_table_cell', [], $pos);
                break;

            case DOKU_LEXER_MATCHED:
                // Only the row-separator pattern can match a newline; the
                // other in-table pattern is the bare `|` cell separator.
                if (str_contains($match, "\n")) {
                    $handler->addCall('gfm_table_row', [], $pos);
                }
                // Both separators open a cell: the first cell of a new row,
                // or the next cell within the current row.
                $handler->addCall('gfm_table_cell', [], $pos);
                break;

            case DOKU_LEXER_UNMATCHED:
                $handler->addCall('cdata', [$match], $pos);
                break;

            case DOKU_LEXER_EXIT:
                $handler->addCall('gfm_table_end', [], $pos);
                /** @var GfmTableRewriter $reWriter */
                $reWriter = $handler->getCallWriter();
                // process() hands back the call writer to continue with.
                $handler->setCallWriter($reWriter->process());
                break;
        }
        return true;
    }
}