xref: /dokuwiki/inc/Parsing/ParserMode/GfmListblock.php (revision 74031e463764923581b9204cebc0fc3f34ce881f)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Handler\GfmLists;
7use dokuwiki\Parsing\Handler\Nest;
8use dokuwiki\Parsing\ModeRegistry;
9
10/**
11 * GFM list block.
12 *
13 * Captures an entire list block atomically (one addSpecialPattern match) and
14 * walks the captured text in handle(), grouping lines into items. Each item's
15 * body is dedented to its content column and parsed by a cached sub-parser
16 * (ModeRegistry::getSubParser) so block content - paragraphs, fenced code,
17 * blockquotes, plugin blocks - work inside items uniformly without depending
18 * on column-0 anchoring of nested mode patterns.
19 *
20 * Sub-parser mode set: every active mode except CATEGORY_BASEONLY (i.e. no
21 * Header inside list items, since `<h1>`-`<h6>` inside `<li>` is never
22 * desirable and section nesting must not span into items) and gfm_listblock
23 * itself (defensive guard against lexer re-entry on pathological inputs;
24 * normal nested lists are caught by the outer pattern instead).
25 *
26 * Each item's sub-parsed calls are wrapped in a `nest` instruction (see
27 * Handler\Nest) before they reach the outer handler. This is essential:
28 * the sub-parser's Block rewriter has already wrapped multi-paragraph
29 * content in `p_open`/`p_close`, and without nest-wrapping the main
30 * handler's Block rewriter would see those paragraphs and add another
31 * `<p>` around the entire replayed range, producing nested `<p>` tags.
32 * Block treats `nest` as opaque and the renderer base class unwraps it
33 * transparently — the same pattern Footnote uses.
34 *
35 * Indentation rule: depth = (indent / 2) + 1. Tabs become two spaces. 1- and
36 * 3-space indents round down. Marker characters: -, *, + (unordered) and
37 * digits followed by . or ) (ordered). Nested lists are caught by the
38 * outer pattern (each marker at any 2-space-multiple indent is its own
39 * item at the corresponding depth) and stitched back into nested HTML by
40 * the GfmLists rewriter.
41 */
class GfmListblock extends AbstractMode
{
    /**
     * Regex fragment matching one list marker.
     *
     * Either an unordered marker (`-`, `*`, `+`) or an ordered marker
     * (1-9 digits followed by `.` or `)`). Used by the entry pattern in
     * connectTo() and by the per-line classifier in parseItems().
     */
    protected const MARKER = '(?:[-*+]|\d{1,9}[.)])';

    /** @inheritdoc */
    public function getSort()
    {
        return 10;
    }

    /**
     * Register this mode as a block-EOL mode with the mode registry.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_listblock');
    }

    /**
     * Register the special pattern that captures a whole list block.
     *
     * The pattern starts on a marker line (any indent) and then loops over
     * four alternatives until none matches:
     *
     *   1. A subsequent marker line at any indent.
     *   2. An indented continuation line (>= 2 leading spaces/tabs followed
     *      by content).
     *   3. A blank line followed by indented content (any number of
     *      intervening blank lines tolerated via the lookahead).
     *   4. A blank line followed by a next marker (same multi-blank
     *      tolerance as alt 3).
     *
     * A marker must be followed by a space/tab or sit directly at the end
     * of its line, so `--` or `**bold**` at line start do not open a list.
     *
     * The block ends naturally when none of the alternatives match, for
     * example at a column-0 non-marker line, or at blank lines followed by
     * content that is neither indented nor a marker line.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // One marker line: newline, optional indent, marker, then either
        // whitespace plus the rest of the line or an immediate line end.
        $markerLine = '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))';

        $pattern =
            $markerLine .
            '(?:' .
                $markerLine .
            '|' . '\n[ \t]{2,}\S[^\n]*' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]{2,}\S)' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]*' . self::MARKER . ')' .
            ')*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_listblock');
    }

    /**
     * Convert the captured block into handler calls.
     *
     * Sequence:
     *   1. parseItems() splits the captured text into per-item records.
     *      If it yields nothing the raw match is emitted as cdata.
     *   2. Install GfmLists as a CallWriter rewriter on the main handler.
     *   3. Emit list_open carrying the first item's marker.
     *   4. For each item:
     *        - If not the first, emit list_item with the item's marker.
     *        - Sub-parse the dedented item body via the cached sub-parser.
     *        - Filter document_start/end and the outer p_open/p_close pair
     *          for tight items (single paragraph), see filterSubCalls().
     *        - Wrap the filtered calls in a Nest so the main handler's
     *          Block rewriter treats them as opaque.
     *   5. Emit list_close and finalise the GfmLists rewriter.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $items = $this->parseItems($match);
        if (empty($items)) {
            $handler->addCall('cdata', [$match], $pos);
            return true;
        }

        $handler->setCallWriter(new GfmLists($handler->getCallWriter()));
        $handler->addCall('list_open', [$items[0]['markerMatch']], $pos);

        // Sub-parser without base-only modes and without this mode itself.
        $subParser = ModeRegistry::getInstance()
            ->getSubParser([ModeRegistry::CATEGORY_BASEONLY], ['gfm_listblock']);
        $subHandler = $subParser->getHandler();

        foreach ($items as $i => $item) {
            $itemPos = $pos + $item['offset'];
            if ($i > 0) {
                $handler->addCall('list_item', [$item['markerMatch']], $itemPos);
            }

            // The sub-handler is shared between items, so clear it first.
            $subHandler->reset();
            $subParser->parse($item['body']);
            $itemCalls = $this->filterSubCalls($subHandler->calls);
            if (empty($itemCalls)) continue; // empty item, nothing to emit

            // Wrap the item content in a Nest so the main handler's Block
            // rewriter does not double-wrap our already-paragraphed content.
            $outer = $handler->getCallWriter();
            $nest = new Nest($outer);
            $handler->setCallWriter($nest);
            foreach ($itemCalls as $call) {
                // Sub-handler positions are relative to the item body; shift
                // them back into the source. $itemPos is the start of the
                // marker line, so the result is approximate: it does not
                // account for the marker prefix or the dedent of
                // continuation lines.
                $handler->addCall($call[0], $call[1], $itemPos + $call[2]);
            }
            $handler->setCallWriter($nest->process());
        }

        $handler->addCall('list_close', [], $pos + strlen($match));
        $reWriter = $handler->getCallWriter();
        $handler->setCallWriter($reWriter->process());

        return true;
    }

    /**
     * Walk the captured block, grouping lines into items.
     *
     * Each returned item describes one list_item: its marker (in the form
     * "\n{indent}{marker}", with tabs in the indent expanded to two
     * spaces), the dedented body, the dedent column, and the byte offset of
     * the marker line within $match.
     *
     * Lines are classified as marker / continuation / blank. A marker line
     * starts a new item; continuation and blank lines accumulate into the
     * current item's body. Continuation lines are dedented by up to
     * indent + marker_width + 1 leading columns (the item's content column
     * for single-space-after-marker cases). Only the leading whitespace of
     * a continuation line is tab-expanded for that purpose; everything
     * after it is kept verbatim, exactly like the text on the marker line.
     * Blank lines are kept as empty body lines while they are in the middle
     * of the body and stripped from the trailing edge by joinBody() so
     * single-paragraph items parse tight.
     *
     * @param string $match the raw special-pattern match (starts with \n)
     * @return array<int, array{markerMatch: string, dedent: int, body: string, offset: int}>
     */
    protected function parseItems($match)
    {
        $stripped = ltrim($match, "\n");
        // Byte offset of the current line within $match; starts after the
        // leading newline(s) removed above.
        $cursor = strlen($match) - strlen($stripped);

        // Group 1: indent, group 2: marker, group 3: text after the marker
        // (unset when the marker is the last thing on the line).
        $markerRegex = '/^([ \t]*)(' . self::MARKER . ')(?:[ \t](.*)|$)/';

        $items = [];
        $current = null;
        $bodyLines = [];

        foreach (explode("\n", $stripped) as $line) {
            if (preg_match($markerRegex, $line, $m)) {
                // A new marker closes the item collected so far.
                if ($current !== null) {
                    $current['body'] = $this->joinBody($bodyLines);
                    $items[] = $current;
                }
                $indent = str_replace("\t", '  ', $m[1]);
                $marker = $m[2];
                $current = [
                    'markerMatch' => "\n" . $indent . $marker,
                    'dedent' => strlen($indent) + strlen($marker) + 1,
                    'offset' => $cursor,
                ];
                $bodyLines = [$m[3] ?? ''];
            } elseif ($current !== null) {
                $bodyLines[] = trim($line) === ''
                    ? ''
                    : $this->dedentLine($line, $current['dedent']);
            }

            $cursor += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($current !== null) {
            $current['body'] = $this->joinBody($bodyLines);
            $items[] = $current;
        }

        return $items;
    }

    /**
     * Remove up to $columns columns of leading indentation from a line.
     *
     * Tabs count as two columns, but only inside the leading whitespace
     * run. Tabs that are part of the line's content are left untouched so
     * the sub-parser sees the text as the author wrote it.
     *
     * @param string $line a non-blank continuation line
     * @param int $columns maximum number of leading columns to strip
     * @return string the dedented line
     */
    private function dedentLine(string $line, int $columns): string
    {
        $leadLength = strspn($line, " \t");
        $leading = str_replace("\t", '  ', substr($line, 0, $leadLength));
        $strip = min($columns, strlen($leading));

        return substr($leading, $strip) . substr($line, $leadLength);
    }

    /**
     * Join body lines into a string, trimming trailing blank lines.
     *
     * Trailing blanks would reach the sub-parser and cause Block to wrap
     * the otherwise-single paragraph content in `p_open`/`p_close`,
     * forcing a tight item into loose-item shape. Stripping them here
     * preserves the tight rendering for items that look tight in source.
     *
     * @param string[] $lines body lines without line terminators
     * @return string the lines joined by "\n", without trailing newlines
     */
    protected function joinBody(array $lines): string
    {
        return rtrim(implode("\n", $lines), "\n");
    }

    /**
     * Filter the sub-parser's flat call list before nest-wrapping it.
     *
     * Drops a leading `document_start` and a trailing `document_end`, and
     * strips the outer `p_open` / `p_close` pair when the list contains
     * exactly one `p_open`, starts with it and ends with `p_close` (a
     * tight item), so the content sits inline inside the list item. Any
     * other shape, such as several paragraphs or a paragraph followed by
     * another block, is returned with its paragraph calls untouched.
     *
     * @param array $calls handler calls; each entry has the call name at index 0
     * @return array the filtered calls
     */
    protected function filterSubCalls(array $calls)
    {
        if ($calls && $calls[0][0] === 'document_start') array_shift($calls);
        if ($calls && end($calls)[0] === 'document_end') array_pop($calls);

        $pCount = 0;
        foreach ($calls as $c) {
            if ($c[0] === 'p_open') $pCount++;
        }

        // $pCount === 1 guarantees $calls is non-empty here.
        if (
            $pCount === 1
            && $calls[0][0] === 'p_open'
            && end($calls)[0] === 'p_close'
        ) {
            array_shift($calls);
            array_pop($calls);
        }

        return $calls;
    }
}
286