xref: /dokuwiki/inc/Parsing/ParserMode/GfmListblock.php (revision 685560eb3044321b3bdd0be40985871ced5f1d05)
1*685560ebSAndreas Gohr<?php
2*685560ebSAndreas Gohr
3*685560ebSAndreas Gohrnamespace dokuwiki\Parsing\ParserMode;
4*685560ebSAndreas Gohr
5*685560ebSAndreas Gohruse dokuwiki\Parsing\Handler;
6*685560ebSAndreas Gohruse dokuwiki\Parsing\Handler\GfmLists;
7*685560ebSAndreas Gohruse dokuwiki\Parsing\Handler\Nest;
8*685560ebSAndreas Gohruse dokuwiki\Parsing\ModeRegistry;
9*685560ebSAndreas Gohr
10*685560ebSAndreas Gohr/**
11*685560ebSAndreas Gohr * GFM list block.
12*685560ebSAndreas Gohr *
13*685560ebSAndreas Gohr * Captures an entire list block atomically (one addSpecialPattern match) and
14*685560ebSAndreas Gohr * walks the captured text in handle(), grouping lines into items. Each item's
15*685560ebSAndreas Gohr * body is dedented to its content column and parsed by a cached sub-parser
16*685560ebSAndreas Gohr * (ModeRegistry::getSubParser) so block content - paragraphs, fenced code,
17*685560ebSAndreas Gohr * blockquotes, plugin blocks - work inside items uniformly without depending
18*685560ebSAndreas Gohr * on column-0 anchoring of nested mode patterns.
19*685560ebSAndreas Gohr *
20*685560ebSAndreas Gohr * Sub-parser mode set: every active mode except CATEGORY_BASEONLY (i.e. no
21*685560ebSAndreas Gohr * Header inside list items, since `<h1>`-`<h6>` inside `<li>` is never
22*685560ebSAndreas Gohr * desirable and section nesting must not span into items) and gfm_listblock
23*685560ebSAndreas Gohr * itself (defensive guard against lexer re-entry on pathological inputs;
24*685560ebSAndreas Gohr * normal nested lists are caught by the outer pattern instead).
25*685560ebSAndreas Gohr *
26*685560ebSAndreas Gohr * Each item's sub-parsed calls are wrapped in a `nest` instruction (see
27*685560ebSAndreas Gohr * Handler\Nest) before they reach the outer handler. This is essential:
28*685560ebSAndreas Gohr * the sub-parser's Block rewriter has already wrapped multi-paragraph
29*685560ebSAndreas Gohr * content in `p_open`/`p_close`, and without nest-wrapping the main
30*685560ebSAndreas Gohr * handler's Block rewriter would see those paragraphs and add another
31*685560ebSAndreas Gohr * `<p>` around the entire replayed range, producing nested `<p>` tags.
32*685560ebSAndreas Gohr * Block treats `nest` as opaque and the renderer base class unwraps it
33*685560ebSAndreas Gohr * transparently — the same pattern Footnote uses.
34*685560ebSAndreas Gohr *
35*685560ebSAndreas Gohr * Indentation rule: depth = (indent / 2) + 1. Tabs become two spaces. 1- and
36*685560ebSAndreas Gohr * 3-space indents round down. Marker characters: -, *, + (unordered) and
37*685560ebSAndreas Gohr * digits followed by . or ) (ordered). Nested lists are caught by the
38*685560ebSAndreas Gohr * outer pattern (each marker at any 2-space-multiple indent is its own
39*685560ebSAndreas Gohr * item at the corresponding depth) and stitched back into nested HTML by
40*685560ebSAndreas Gohr * the GfmLists rewriter.
41*685560ebSAndreas Gohr */
class GfmListblock extends AbstractMode
{
    /**
     * Regex fragment matching one list marker.
     *
     * Either an unordered marker (`-`, `*`, `+`) or an ordered marker
     * (1-9 digits followed by `.` or `)`). Used by the entry pattern in
     * connectTo() and by the per-line classifier in parseItems().
     */
    protected const MARKER = '(?:[-*+]|\d{1,9}[.)])';

    /** @inheritdoc */
    public function getSort()
    {
        return 10;
    }

    /** @inheritdoc */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_listblock');
    }

    /**
     * Register the special pattern that captures a whole list block.
     *
     * The pattern starts on a marker line (any indent) and then loops over
     * four alternatives until none matches:
     *
     *   1. A subsequent marker line at any indent.
     *   2. An indented continuation line (>= 2 leading space/tab characters
     *      followed by content).
     *   3. A blank line followed by indented content (any number of
     *      intervening blank lines tolerated via the lookahead).
     *   4. A blank line followed by a next marker (same multi-blank
     *      tolerance as alt 3).
     *
     * The block ends naturally when none of the alternatives match — for
     * example a column-0 non-marker line, or one or more blank lines
     * followed by content that is neither indented nor a marker line.
     *
     * NOTE(review): `[ \t]{2,}` requires two whitespace characters, so a
     * continuation line indented by a single tab is not captured even
     * though parseItems() counts a tab as two columns — confirm whether
     * that is intended.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // A marker must be followed by whitespace + content, or sit alone
        // at the end of its line (empty item).
        $markerLine = '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))';

        $pattern =
            $markerLine .
            '(?:' .
                $markerLine .
            '|' . '\n[ \t]{2,}\S[^\n]*' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]{2,}\S)' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]*' . self::MARKER . ')' .
            ')*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_listblock');
    }

    /**
     * Convert the captured block into handler calls.
     *
     * Sequence:
     *   1. parseItems() splits the captured text into per-item records.
     *   2. Install GfmLists as a CallWriter rewriter on the main handler.
     *   3. Emit list_open carrying the first item's marker — the rewriter's
     *      handleListOpen opens the `<ul>`/`<ol>` and the first `<li>`.
     *   4. For each item:
     *        - If not the first, emit list_item (closes the previous `<li>`
     *          and opens a new one in the rewriter).
     *        - Sub-parse the dedented item body via the cached sub-parser.
     *        - Filter document_start/end and the outer p_open/p_close pair
     *          for tight items (single paragraph).
     *        - Wrap the filtered calls in a Nest so the main handler's
     *          Block rewriter treats them as opaque.
     *   5. Emit list_close and finalise the GfmLists rewriter.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $items = $this->parseItems($match);
        if (empty($items)) {
            // Defensive fallback: no marker line found, pass the text through.
            $handler->addCall('cdata', [$match], $pos);
            return true;
        }

        $handler->setCallWriter(new GfmLists($handler->getCallWriter()));
        $handler->addCall('list_open', [$items[0]['markerMatch']], $pos);

        $subParser = ModeRegistry::getInstance()
            ->getSubParser([ModeRegistry::CATEGORY_BASEONLY], ['gfm_listblock']);
        $subHandler = $subParser->getHandler();

        foreach ($items as $i => $item) {
            $itemPos = $pos + $item['offset'];
            if ($i > 0) {
                $handler->addCall('list_item', [$item['markerMatch']], $itemPos);
            }

            // The sub-parser is shared between items, so clear the calls
            // left over from the previous item before parsing this one.
            $subHandler->reset();
            $subParser->parse($item['body']);
            $itemCalls = $this->filterSubCalls($subHandler->calls);
            if (empty($itemCalls)) continue; // empty item — nothing to emit

            // Wrap the item content in a Nest so the main handler's Block
            // rewriter does not double-wrap our already-paragraphed content.
            // Block treats `nest` as opaque and the renderer base class
            // unwraps it transparently, the same pattern Footnote uses.
            $outer = $handler->getCallWriter();
            $nest = new Nest($outer);
            $handler->setCallWriter($nest);
            foreach ($itemCalls as $call) {
                // sub-handler positions are relative to the item body; offset
                // them back into the source so section-edit anchors work.
                // The body is dedented and starts after the marker, so the
                // resulting position is relative to the item's line start.
                $handler->addCall($call[0], $call[1], $itemPos + $call[2]);
            }
            $handler->setCallWriter($nest->process());
        }

        $handler->addCall('list_close', [], $pos + strlen($match));
        $reWriter = $handler->getCallWriter();
        $handler->setCallWriter($reWriter->process());

        return true;
    }

    /**
     * Walk the captured block, grouping lines into items.
     *
     * Each returned item describes one list_item: its marker (in the form
     * "\n{indent}{marker}" so GfmLists::interpretSyntax can parse it), the
     * dedented body, dedent column, and absolute offset within $match.
     *
     * Lines are classified as marker / continuation / blank. A marker line
     * starts a new item; continuation and blank lines accumulate into the
     * current item's body. Continuation lines are dedented by up to
     * indent + marker_width + 1 leading spaces (the item's content column
     * for single-space-after-marker cases); see dedentLine(). Only leading
     * whitespace is tab-expanded — tabs inside the content are preserved,
     * matching how the marker line's own content is kept verbatim. Blank
     * lines are kept as empty body lines while they're in the middle of the
     * body and stripped from the trailing edge by joinBody() so
     * single-paragraph items parse tight.
     *
     * @param string $match the raw special-pattern match (starts with \n)
     * @return array<int, array{markerMatch: string, dedent: int, body: string, offset: int}>
     */
    protected function parseItems($match)
    {
        $stripped = ltrim($match, "\n");
        $offsetBase = strlen($match) - strlen($stripped);
        $lines = explode("\n", $stripped);

        $items = [];
        $current = null;
        $bodyLines = [];
        $cursor = $offsetBase; // byte offset of the current line within $match

        foreach ($lines as $line) {
            $isMarker = preg_match(
                '/^([ \t]*)(' . self::MARKER . ')(?:[ \t](.*)|$)/',
                $line,
                $m
            );

            if ($isMarker) {
                // A new marker closes the item collected so far.
                if ($current !== null) {
                    $current['body'] = $this->joinBody($bodyLines);
                    $items[] = $current;
                }
                $indent = str_replace("\t", "  ", $m[1]);
                $marker = $m[2];
                // Group 3 is absent when the marker stands alone on its line.
                $firstLine = $m[3] ?? '';
                $current = [
                    'markerMatch' => "\n" . $indent . $marker,
                    'dedent' => strlen($indent) + strlen($marker) + 1,
                    'offset' => $cursor,
                ];
                $bodyLines = [$firstLine];
            } elseif ($current !== null) {
                if (trim($line) === '') {
                    $bodyLines[] = '';
                } else {
                    $bodyLines[] = $this->dedentLine($line, $current['dedent']);
                }
            }

            $cursor += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($current !== null) {
            $current['body'] = $this->joinBody($bodyLines);
            $items[] = $current;
        }

        return $items;
    }

    /**
     * Remove up to $dedent columns of leading indentation from one line.
     *
     * Tabs in the leading whitespace run count as two spaces each. Any
     * indentation beyond $dedent is kept (as spaces) and the remainder of
     * the line is returned untouched, so tabs inside the content survive.
     *
     * @param string $line   a non-blank continuation line
     * @param int    $dedent maximum number of leading columns to strip
     * @return string the dedented line
     */
    protected function dedentLine(string $line, int $dedent): string
    {
        $indentLength = strspn($line, " \t");
        $indent = str_replace("\t", '  ', substr($line, 0, $indentLength));
        $strip = min($dedent, strlen($indent));

        return substr($indent, $strip) . substr($line, $indentLength);
    }

    /**
     * Join body lines into a string, trimming trailing blank lines.
     *
     * Trailing blanks would reach the sub-parser and cause Block to wrap
     * the otherwise-single paragraph content in `p_open`/`p_close`,
     * forcing a tight item into loose-item shape. Stripping them here
     * preserves the tight rendering for items that look tight in source.
     *
     * @param string[] $lines
     * @return string the body text without trailing newlines
     */
    protected function joinBody(array $lines): string
    {
        return rtrim(implode("\n", $lines), "\n");
    }

    /**
     * Filter the sub-parser's flat call list before nest-wrapping it.
     *
     * Drops `document_start` / `document_end` (always emitted by
     * Handler::finalize), and strips the outer `p_open` / `p_close` pair
     * for tight items so their content sits inline inside `<li>`. Loose
     * items (multiple paragraphs, more than one `p_open`) keep their
     * inner pairs untouched. The filtered calls are then wrapped in a
     * Nest by handle() before they reach the GfmLists rewriter.
     *
     * @param array $calls
     * @return array
     */
    protected function filterSubCalls(array $calls)
    {
        if ($calls && $calls[0][0] === 'document_start') array_shift($calls);
        if ($calls && end($calls)[0] === 'document_end') array_pop($calls);

        $pCount = 0;
        foreach ($calls as $c) {
            if ($c[0] === 'p_open') $pCount++;
        }

        // Exactly one paragraph that spans the whole item: unwrap it.
        // $pCount === 1 guarantees $calls is non-empty here.
        if ($pCount === 1
            && $calls[0][0] === 'p_open'
            && end($calls)[0] === 'p_close') {
            array_shift($calls);
            array_pop($calls);
        }

        return $calls;
    }
}
286