xref: /dokuwiki/inc/Parsing/ParserMode/GfmListblock.php (revision eb15e634e1400f6c4d78f5fb40179ca25f41574d)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Handler\GfmLists;
7use dokuwiki\Parsing\Handler\Nest;
8use dokuwiki\Parsing\ModeRegistry;
9
/**
 * GFM list block.
 *
 * Captures an entire list block atomically (one addSpecialPattern match) and
 * walks the captured text in handle(), grouping lines into items. The per-item
 * loop borrows a pooled sub-parser from the ModeRegistry
 * (acquireSubParser() / releaseSubParser()) so each item's body is dedented
 * to its content column and parsed on its own, and block content -
 * paragraphs, fenced code, blockquotes, plugin blocks - works inside items
 * uniformly without depending on column-0 anchoring of nested mode patterns.
 * If any nested mode requests a sub-parser with the same exclusion key while
 * ours is in use, the registry's pool hands them a different slot so their
 * reset() does not corrupt our state.
 *
 * Sub-parser mode set: every active mode except CATEGORY_BASEONLY (i.e. no
 * Header inside list items, since `<h1>`-`<h6>` inside `<li>` is never
 * desirable and section nesting must not span into items) and gfm_listblock
 * itself (defensive guard against lexer re-entry on pathological inputs;
 * normal nested lists are caught by the outer pattern instead).
 *
 * Each item's sub-parsed calls are wrapped in a `nest` instruction (see
 * Handler\Nest) before they reach the outer handler. This is essential:
 * the sub-parser's Block rewriter has already wrapped multi-paragraph
 * content in `p_open`/`p_close`, and without nest-wrapping the main
 * handler's Block rewriter would see those paragraphs and add another
 * `<p>` around the entire replayed range, producing nested `<p>` tags.
 * Block treats `nest` as opaque and the renderer base class unwraps it
 * transparently — the same pattern Footnote uses.
 *
 * Indentation rule: depth = (indent / 2) + 1. Leading tabs become two
 * spaces. 1- and 3-space indents round down. Marker characters: -, *, +
 * (unordered) and digits followed by . or ) (ordered). Nested lists are
 * caught by the outer pattern (each marker at any 2-space-multiple indent is
 * its own item at the corresponding depth) and stitched back into nested
 * HTML by the GfmLists rewriter.
 */
class GfmListblock extends AbstractMode
{
    /**
     * Regex fragment matching one list marker.
     *
     * Either an unordered marker (`-`, `*`, `+`) or an ordered marker
     * (1-9 digits followed by `.` or `)`). Used by the entry pattern in
     * connectTo() and by the per-line classifier in parseItems().
     */
    protected const MARKER = '(?:[-*+]|\d{1,9}[.)])';

    /** @inheritdoc */
    public function getSort()
    {
        return 10;
    }

    /**
     * Register this mode as a block-EOL mode with the registry.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_listblock');
    }

    /**
     * Register the special pattern that captures a whole list block.
     *
     * The pattern starts on a marker line (any indent) and then loops over
     * four alternatives until none matches:
     *
     *   1. A subsequent marker line at any indent.
     *   2. An indented continuation line (>= 2 leading spaces with content).
     *   3. A blank line followed by indented content (any number of
     *      intervening blank lines tolerated via the lookahead).
     *   4. A blank line followed by a next marker (same multi-blank
     *      tolerance as alt 3).
     *
     * The block ends naturally when none of the alternatives match — for
     * example a column-0 non-marker line, or two-or-more blank lines
     * followed by non-list content.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // One marker line: optional indent, marker, then either whitespace
        // plus the rest of the line or nothing at all (empty item).
        $markerLine = '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))';

        $pattern =
            $markerLine .
            '(?:' .
                $markerLine .
            '|' . '\n[ \t]{2,}\S[^\n]*' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]{2,}\S)' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]*' . self::MARKER . ')' .
            ')*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_listblock');
    }

    /**
     * Convert the captured block into handler calls.
     *
     * Sequence:
     *   1. parseItems() splits the captured text into per-item records.
     *   2. Install GfmLists as a CallWriter rewriter on the main handler.
     *   3. Emit list_open carrying the first item's marker.
     *   4. For each item:
     *        - If not the first, emit list_item carrying the item's marker.
     *        - Sub-parse the dedented item body with the pooled sub-parser
     *          acquired from the ModeRegistry.
     *        - Filter document_start/end and the outer p_open/p_close pair
     *          for tight items (single paragraph).
     *        - Wrap the filtered calls in a Nest so the main handler's
     *          Block rewriter treats them as opaque.
     *   5. Release the sub-parser (also when sub-parsing throws), emit
     *      list_close and finalise the GfmLists rewriter.
     *
     * If parseItems() finds no item at all, the raw match is emitted as
     * cdata instead.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $items = $this->parseItems($match);
        if (empty($items)) {
            $handler->addCall('cdata', [$match], $pos);
            return true;
        }

        $handler->setCallWriter(new GfmLists($handler->getCallWriter()));
        $handler->addCall('list_open', [$items[0]['markerMatch']], $pos);

        $registry = ModeRegistry::getInstance();
        $excludeCats = [ModeRegistry::CATEGORY_BASEONLY];
        $excludeModes = ['gfm_listblock'];
        $subParser = $registry->acquireSubParser($excludeCats, $excludeModes);

        try {
            $subHandler = $subParser->getHandler();

            foreach ($items as $i => $item) {
                $itemPos = $pos + $item['offset'];
                if ($i > 0) {
                    $handler->addCall('list_item', [$item['markerMatch']], $itemPos);
                }

                $subHandler->reset();
                $subParser->parse($item['body']);
                $itemCalls = $this->filterSubCalls($subHandler->calls);
                if (empty($itemCalls)) {
                    continue; // empty item — nothing to emit
                }

                $this->emitNested($handler, $itemCalls, $itemPos);
            }
        } finally {
            // Always hand the sub-parser back, even if parsing an item body
            // threw; otherwise the acquired sub-parser is never released.
            $registry->releaseSubParser($excludeCats, $excludeModes);
        }

        $handler->addCall('list_close', [], $pos + strlen($match));
        $reWriter = $handler->getCallWriter();
        $handler->setCallWriter($reWriter->process());

        return true;
    }

    /**
     * Replay sub-parsed calls into the main handler inside a Nest.
     *
     * Wrapping the item content in a Nest keeps the main handler's Block
     * rewriter from double-wrapping the already-paragraphed content.
     *
     * Sub-handler positions are relative to the item body; they are shifted
     * by $basePos to point back into the source. Note that $basePos is the
     * start of the marker line (not of the body text), so the resulting
     * positions are approximate.
     *
     * @param Handler $handler the main handler receiving the calls
     * @param array $calls filtered sub-parser calls ([name, args, pos] each)
     * @param int $basePos source position of the item's marker line
     * @return void
     */
    protected function emitNested(Handler $handler, array $calls, $basePos)
    {
        $nest = new Nest($handler->getCallWriter());
        $handler->setCallWriter($nest);
        foreach ($calls as $call) {
            $handler->addCall($call[0], $call[1], $basePos + $call[2]);
        }
        $handler->setCallWriter($nest->process());
    }

    /**
     * Walk the captured block, grouping lines into items.
     *
     * Each returned item describes one list_item: its marker (in the form
     * "\n{indent}{marker}" with tabs in the indent expanded to two spaces),
     * the dedented body, the dedent column, and the byte offset of the
     * marker line within $match.
     *
     * Lines are classified as marker / continuation / blank. A marker line
     * starts a new item; continuation and blank lines accumulate into the
     * current item's body. Continuation lines are dedented by up to
     * indent + marker_width + 1 leading columns (the item's content column
     * for single-space-after-marker cases). Blank lines are kept as empty
     * body lines while they're in the middle of the body and stripped
     * from the trailing edge by joinBody() so single-paragraph items
     * parse tight.
     *
     * @param string $match the raw special-pattern match (starts with \n)
     * @return array<int, array{markerMatch: string, dedent: int, body: string, offset: int}>
     */
    protected function parseItems($match)
    {
        $stripped = ltrim($match, "\n");
        $lines = explode("\n", $stripped);

        // indent, marker, then either one whitespace + first body line or EOL
        $markerRe = '/^([ \t]*)(' . self::MARKER . ')(?:[ \t](.*)|$)/';

        $items = [];
        $current = null;
        $bodyLines = [];
        // offset of the current line within $match (leading newlines skipped)
        $cursor = strlen($match) - strlen($stripped);

        foreach ($lines as $line) {
            if (preg_match($markerRe, $line, $m) === 1) {
                // a marker line closes the running item and opens a new one
                if ($current !== null) {
                    $current['body'] = $this->joinBody($bodyLines);
                    $items[] = $current;
                }
                $indent = str_replace("\t", "  ", $m[1]);
                $marker = $m[2];
                $current = [
                    'markerMatch' => "\n" . $indent . $marker,
                    'dedent' => strlen($indent) + strlen($marker) + 1,
                    'offset' => $cursor,
                ];
                // group 3 is unset when the marker is alone on its line
                $bodyLines = [$m[3] ?? ''];
            } elseif ($current !== null) {
                $bodyLines[] = trim($line) === ''
                    ? ''
                    : $this->dedentLine($line, $current['dedent']);
            }

            $cursor += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($current !== null) {
            $current['body'] = $this->joinBody($bodyLines);
            $items[] = $current;
        }

        return $items;
    }

    /**
     * Remove up to $dedent columns of leading indentation from a line.
     *
     * Only the leading whitespace run is tab-expanded (one tab = two
     * spaces). Tabs that are part of the line's content — for example
     * inside code — are passed through untouched.
     *
     * @param string $line a non-blank continuation line
     * @param int $dedent maximum number of indent columns to strip
     * @return string the dedented line
     */
    protected function dedentLine(string $line, int $dedent): string
    {
        $indentLen = strspn($line, " \t");
        $indent = str_replace("\t", "  ", substr($line, 0, $indentLen));
        $strip = min($dedent, strlen($indent));

        return substr($indent, $strip) . substr($line, $indentLen);
    }

    /**
     * Join body lines into a string, trimming trailing blank lines.
     *
     * Trailing blanks would reach the sub-parser and cause Block to wrap
     * the otherwise-single paragraph content in `p_open`/`p_close`,
     * forcing a tight item into loose-item shape. Stripping them here
     * preserves the tight rendering for items that look tight in source.
     *
     * @param string[] $lines
     * @return string the lines joined by "\n" without trailing newlines
     */
    protected function joinBody(array $lines): string
    {
        return rtrim(implode("\n", $lines), "\n");
    }

    /**
     * Filter the sub-parser's flat call list before nest-wrapping it.
     *
     * Drops a leading `document_start` and a trailing `document_end`, and
     * strips the outer `p_open` / `p_close` pair for tight items so their
     * content sits inline inside `<li>`. An item counts as tight when it
     * contains exactly one `p_open`, and that paragraph spans the whole
     * remaining call list. Loose items (more than one `p_open`) keep
     * their pairs untouched.
     *
     * @param array $calls calls as collected by the sub-handler
     * @return array the filtered calls, re-indexed from 0
     */
    protected function filterSubCalls(array $calls)
    {
        if ($calls && $calls[0][0] === 'document_start') {
            array_shift($calls);
        }
        if ($calls && end($calls)[0] === 'document_end') {
            array_pop($calls);
        }

        $pCount = 0;
        foreach ($calls as $c) {
            if ($c[0] === 'p_open') {
                $pCount++;
            }
        }

        // $pCount === 1 guarantees $calls is non-empty here
        if (
            $pCount === 1
            && $calls[0][0] === 'p_open'
            && end($calls)[0] === 'p_close'
        ) {
            array_shift($calls);
            array_pop($calls);
        }

        return $calls;
    }
}
293