xref: /dokuwiki/inc/Parsing/ParserMode/GfmListblock.php (revision 309a08521b24a6fff00f318e061096f69771bbad)
1685560ebSAndreas Gohr<?php
2685560ebSAndreas Gohr
3685560ebSAndreas Gohrnamespace dokuwiki\Parsing\ParserMode;
4685560ebSAndreas Gohr
5685560ebSAndreas Gohruse dokuwiki\Parsing\Handler;
6685560ebSAndreas Gohruse dokuwiki\Parsing\Handler\GfmLists;
7685560ebSAndreas Gohruse dokuwiki\Parsing\Handler\Nest;
8685560ebSAndreas Gohruse dokuwiki\Parsing\ModeRegistry;
9685560ebSAndreas Gohr
/**
 * GFM list block.
 *
 * Captures an entire list block atomically (one addSpecialPattern match) and
 * walks the captured text in handle(), grouping lines into items. For the
 * duration of the per-item loop a pooled sub-parser is borrowed from the
 * ModeRegistry (acquireSubParser() / releaseSubParser()); each item's body
 * is dedented to its content column and parsed by that sub-parser, so block
 * content - paragraphs, fenced code, blockquotes, plugin blocks - works
 * inside items uniformly without depending on column-0 anchoring of nested
 * mode patterns. If any nested mode requests a sub-parser with the same
 * exclusion key while ours is in use, the registry's pool hands them a
 * different slot so their reset() does not corrupt our state.
 *
 * Sub-parser mode set: every active mode except CATEGORY_BASEONLY (i.e. no
 * Header inside list items, since `<h1>`-`<h6>` inside `<li>` is never
 * desirable and section nesting must not span into items) and gfm_listblock
 * itself (defensive guard against lexer re-entry on pathological inputs;
 * normal nested lists are caught by the outer pattern instead).
 *
 * Each item's sub-parsed calls are wrapped in a `nest` instruction (see
 * Handler\Nest) before they reach the outer handler. This is essential:
 * the sub-parser's Block rewriter has already wrapped multi-paragraph
 * content in `p_open`/`p_close`, and without nest-wrapping the main
 * handler's Block rewriter would see those paragraphs and add another
 * `<p>` around the entire replayed range, producing nested `<p>` tags.
 * Block treats `nest` as opaque and the renderer base class unwraps it
 * transparently — the same pattern Footnote uses.
 *
 * Indentation rule: depth = (indent / 2) + 1. Tabs become two spaces. 1- and
 * 3-space indents round down. Marker characters: -, *, + (unordered) and
 * digits followed by . or ) (ordered). Nested lists are caught by the
 * outer pattern (each marker at any 2-space-multiple indent is its own
 * item at the corresponding depth) and stitched back into nested HTML by
 * the GfmLists rewriter.
 */
class GfmListblock extends AbstractMode
{
    /**
     * Regex fragment matching one list marker.
     *
     * Either an unordered marker (`-`, `*`, `+`) or an ordered marker
     * (1-9 digits followed by `.` or `)`). Used by the entry pattern in
     * connectTo() and by the per-line classifier in parseItems().
     */
    protected const MARKER = '(?:[-*+]|\d{1,9}[.)])';

    /** @inheritdoc */
    public function getSort()
    {
        return 10;
    }

    /** @inheritdoc */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_listblock');
    }

    /**
     * Register the special pattern that captures a whole list block.
     *
     * The pattern starts on a marker line (any indent) and then loops over
     * four alternatives until none matches:
     *
     *   1. A subsequent marker line at any indent.
     *   2. An indented continuation line (>= 2 leading space/tab characters
     *      followed by content).
     *   3. A blank line followed by indented content (any number of
     *      intervening blank lines tolerated via the lookahead).
     *   4. A blank line followed by a next marker (same multi-blank
     *      tolerance as alt 3).
     *
     * The block ends naturally when none of the alternatives match — for
     * example a column-0 non-marker line, or two-or-more blank lines
     * followed by non-list content.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // one marker line: optional indent, marker, then either
        // whitespace + rest of line or an immediate line end
        $markerLine = '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))';

        $pattern =
            $markerLine .
            '(?:' .
                $markerLine .
            '|' . '\n[ \t]{2,}\S[^\n]*' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]{2,}\S)' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]*' . self::MARKER . ')' .
            ')*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_listblock');
    }

    /**
     * Convert the captured block into handler calls.
     *
     * Sequence:
     *   1. parseItems() splits the captured text into per-item records.
     *      If that yields nothing, the match is emitted verbatim as cdata.
     *   2. Install GfmLists as a CallWriter rewriter on the main handler.
     *   3. Emit list_open carrying the first item's marker — the rewriter's
     *      handleListOpen opens the `<ul>`/`<ol>` and the first `<li>`.
     *   4. Borrow a sub-parser from the registry and, for each item:
     *        - If not the first, emit list_item (closes the previous `<li>`
     *          and opens a new one in the rewriter).
     *        - Sub-parse the dedented item body.
     *        - Filter document_start/end and the outer p_open/p_close pair
     *          for tight items (single paragraph).
     *        - Wrap the filtered calls in a Nest so the main handler's
     *          Block rewriter treats them as opaque.
     *      The sub-parser is handed back to the registry even when one of
     *      these steps throws.
     *   5. Emit list_close and finalise the GfmLists rewriter.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $items = $this->parseItems($match);
        if (empty($items)) {
            $handler->addCall('cdata', [$match], $pos);
            return true;
        }

        $handler->setCallWriter(new GfmLists($handler->getCallWriter()));
        $handler->addCall('list_open', [$items[0]['markerMatch']], $pos);

        $registry = ModeRegistry::getInstance();
        $excludeCats = [ModeRegistry::CATEGORY_BASEONLY];
        $excludeModes = ['gfm_listblock'];
        $subParser = $registry->acquireSubParser($excludeCats, $excludeModes);

        // The registry is a shared singleton: make sure the borrowed
        // sub-parser is released even if parsing an item body throws,
        // otherwise the pool slot would stay marked as in use.
        try {
            $subHandler = $subParser->getHandler();

            foreach ($items as $i => $item) {
                $itemPos = $pos + $item['offset'];
                if ($i > 0) {
                    $handler->addCall('list_item', [$item['markerMatch']], $itemPos);
                }

                $subHandler->reset();
                $subParser->parse($item['body']);
                $itemCalls = $this->filterSubCalls($subHandler->calls);
                if (empty($itemCalls)) {
                    continue; // empty item — nothing to emit
                }

                $this->emitNested($handler, $itemCalls, $itemPos);
            }
        } finally {
            $registry->releaseSubParser($excludeCats, $excludeModes);
        }

        $handler->addCall('list_close', [], $pos + strlen($match));
        $reWriter = $handler->getCallWriter();
        $handler->setCallWriter($reWriter->process());

        return true;
    }

    /**
     * Replay one item's sub-parsed calls on the main handler inside a Nest.
     *
     * Wrapping the item content in a Nest keeps the main handler's Block
     * rewriter from double-wrapping our already-paragraphed content.
     * Block treats `nest` as opaque and the renderer base class unwraps
     * it transparently, the same pattern Footnote uses.
     *
     * @param Handler $handler the main handler receiving the calls
     * @param array $calls filtered sub-handler calls ([name, args, pos])
     * @param int $basePos source position the call positions are offset by
     * @return void
     */
    protected function emitNested(Handler $handler, array $calls, $basePos)
    {
        $nest = new Nest($handler->getCallWriter());
        $handler->setCallWriter($nest);
        foreach ($calls as $call) {
            // sub-handler positions are relative to the item body; offset
            // them back into the source so section-edit anchors work.
            $handler->addCall($call[0], $call[1], $basePos + $call[2]);
        }
        // process() hands back the writer to continue with
        $handler->setCallWriter($nest->process());
    }

    /**
     * Walk the captured block, grouping lines into items.
     *
     * Each returned item describes one list_item: its marker (in the form
     * "\n{indent}{marker}" so GfmLists::interpretSyntax can parse it), the
     * dedented body, dedent column, and the offset of the item's line
     * within $match.
     *
     * Lines are classified as marker / continuation / blank. A marker line
     * starts a new item; continuation and blank lines accumulate into the
     * current item's body. Continuation lines are dedented by up to
     * indent + marker_width + 1 leading spaces (the item's content column
     * for single-space-after-marker cases). Tabs are expanded to two
     * spaces in the leading indentation only; tabs inside the line's
     * content are passed through untouched, exactly like the content of
     * the marker line itself. Blank lines are kept as empty body lines
     * while they're in the middle of the body and stripped from the
     * trailing edge by joinBody() so single-paragraph items parse tight.
     *
     * @param string $match the raw special-pattern match (starts with \n)
     * @return array<int, array{markerMatch: string, dedent: int, body: string, offset: int}>
     */
    protected function parseItems($match)
    {
        $stripped = ltrim($match, "\n");
        $offsetBase = strlen($match) - strlen($stripped);
        $lines = explode("\n", $stripped);

        $items = [];
        $current = null;
        $bodyLines = [];
        $cursor = $offsetBase;

        foreach ($lines as $line) {
            $isMarker = preg_match(
                '/^([ \t]*)(' . self::MARKER . ')(?:[ \t](.*)|$)/',
                $line,
                $m
            );

            if ($isMarker) {
                // a new marker closes the item collected so far
                if ($current !== null) {
                    $current['body'] = $this->joinBody($bodyLines);
                    $items[] = $current;
                }
                $indent = str_replace("\t", "  ", $m[1]);
                $marker = $m[2];
                // group 3 is absent when the marker is the whole line
                $firstLine = $m[3] ?? '';
                $current = [
                    'markerMatch' => "\n" . $indent . $marker,
                    'dedent' => strlen($indent) + strlen($marker) + 1,
                    'offset' => $cursor,
                ];
                $bodyLines = [$firstLine];
            } elseif ($current !== null) {
                if (trim($line) === '') {
                    $bodyLines[] = '';
                } else {
                    // Split the line into its indentation and its content
                    // and expand tabs in the indentation only. Expanding
                    // the whole line would rewrite tabs that belong to the
                    // content (e.g. inside a code block within the item).
                    $content = ltrim($line, " \t");
                    $lead = substr($line, 0, strlen($line) - strlen($content));
                    $lead = str_replace("\t", "  ", $lead);
                    $strip = min($current['dedent'], strlen($lead));
                    $bodyLines[] = substr($lead, $strip) . $content;
                }
            }

            $cursor += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($current !== null) {
            $current['body'] = $this->joinBody($bodyLines);
            $items[] = $current;
        }

        return $items;
    }

    /**
     * Join body lines into a string, trimming trailing blank lines.
     *
     * Trailing blanks would reach the sub-parser and cause Block to wrap
     * the otherwise-single paragraph content in `p_open`/`p_close`,
     * forcing a tight item into loose-item shape. Stripping them here
     * preserves the tight rendering for items that look tight in source.
     *
     * @param string[] $lines
     * @return string the lines joined by "\n" without trailing newlines
     */
    protected function joinBody(array $lines): string
    {
        return rtrim(implode("\n", $lines), "\n");
    }

    /**
     * Filter the sub-parser's flat call list before nest-wrapping it.
     *
     * Drops `document_start` / `document_end` (always emitted by
     * Handler::finalize), and strips the outer `p_open` / `p_close` pair
     * for tight items so their content sits inline inside `<li>`. Loose
     * items (multiple paragraphs, more than one `p_open`) keep their
     * inner pairs untouched. The filtered calls are then wrapped in a
     * Nest by handle() before they reach the GfmLists rewriter.
     *
     * @param array $calls
     * @return array
     */
    protected function filterSubCalls(array $calls)
    {
        if ($calls && $calls[0][0] === 'document_start') {
            array_shift($calls);
        }
        if ($calls && end($calls)[0] === 'document_end') {
            array_pop($calls);
        }

        $pCount = 0;
        foreach ($calls as $c) {
            if ($c[0] === 'p_open') {
                $pCount++;
            }
        }

        // exactly one paragraph spanning the whole item: a tight item.
        // $pCount === 1 also guarantees $calls is not empty here.
        if (
            $pCount === 1
            && $calls[0][0] === 'p_open'
            && end($calls)[0] === 'p_close'
        ) {
            array_shift($calls);
            array_pop($calls);
        }

        return $calls;
    }
}
293