xref: /dokuwiki/inc/Parsing/ParserMode/GfmQuote.php (revision 309a08521b24a6fff00f318e061096f69771bbad)
1*309a0852SAndreas Gohr<?php
2*309a0852SAndreas Gohr
3*309a0852SAndreas Gohrnamespace dokuwiki\Parsing\ParserMode;
4*309a0852SAndreas Gohr
5*309a0852SAndreas Gohruse dokuwiki\Parsing\Handler;
6*309a0852SAndreas Gohruse dokuwiki\Parsing\Handler\Nest;
7*309a0852SAndreas Gohruse dokuwiki\Parsing\ModeRegistry;
8*309a0852SAndreas Gohr
/**
 * Block quotes — single mode covering both DokuWiki and GFM dialects.
 *
 * Captures one or more consecutive column-0 `>`-prefixed lines via
 * addSpecialPattern. Nesting is resolved at this level by counting
 * leading `>` markers per line and emitting `quote_open` / `quote_close`
 * pairs around per-depth body segments — sub-parser recursion is
 * deliberately not used because each sub-parser invocation needs its
 * own Handler instance and threading the nesting through the registry
 * pool would only buy us back what depth-walking already provides.
 *
 * Each per-depth segment's body is sub-parsed via
 * ModeRegistry::withSubParser() so block content (lists, fenced code,
 * tables) works inside the body. The sub-parser excludes BASEONLY so
 * headers do not fire inside a blockquote — same rationale as
 * GfmListblock: header instructions drive TOC entries, section-edit
 * anchors, and section_open/section_close ranges that don't compose
 * with a `<blockquote>` container. The sub-parser also excludes
 * gfm_quote itself; nesting is handled at this level, not via
 * sub-parser recursion. When a list inside a quote re-fires gfm_quote
 * during the list-item sub-parse, the registry's pool hands the
 * inner call a different parser instance for the same exclusion key,
 * so the outer parse state is not corrupted.
 *
 * Lazy continuation is deliberately not supported. Every quote line
 * must begin with `>` at column 0; the first non-`>` line ends the
 * quote. This matches the policy GfmListblock enforces for lists —
 * markers required on every line. Trade-off: a few CommonMark
 * blockquote spec examples that rely on lazy continuation stay red,
 * but the parser stays single-pass and predictable.
 *
 * Rendering shape depends on syntax preference. Under MD-preferred
 * (`markdown`, `md+dw`) the sub-parser's paragraph wrapping survives:
 * a quote with one paragraph emits `<blockquote><p>...</p></blockquote>`.
 * Under DW-preferred (`dokuwiki`, `dw+md`) a post-pass flattens
 * paragraph wrapping into explicit `linebreak` calls so existing DW
 * pages keep their `<blockquote>...line1<br/>line2...</blockquote>`
 * rendering. Same `quote_open` / `quote_close` instructions in both
 * modes — no renderer change required.
 */
class GfmQuote extends AbstractMode
{
    /** @inheritdoc */
    public function getSort()
    {
        return 220;
    }

    /** @inheritdoc */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_quote');
    }

    /**
     * Capture an entire blockquote in one match.
     *
     * The pattern requires a column-0 `>` on every line. The first
     * non-`>` line ends the capture (no lazy continuation). A bare `>`
     * with no body is valid — it represents an empty paragraph break
     * inside the quote (spec 240) or an empty quote (spec 239).
     *
     * @param string $mode the lexer state name to wire the pattern into
     */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern('\n>[^\n]*(?:\n>[^\n]*)*', $mode, 'gfm_quote');
    }

    /**
     * Turn one captured blockquote into quote_open / body / quote_close calls.
     *
     * The match is walked line by line. Consecutive lines of the same
     * depth are buffered and sub-parsed together as one body segment;
     * whenever the depth changes, the pending segment is flushed and
     * the required number of `quote_open` or `quote_close` calls is
     * emitted to reach the new depth. Quotes still open after the last
     * line are closed at the end of the match.
     *
     * Depth transitions are stamped with the position of the newline
     * that introduces the line on which the depth changes. For the
     * first line that is `$pos` itself, so a plain single-depth quote
     * opens at `$pos` and closes at `$pos + strlen($match)`; for later
     * lines the position advances with the source instead of jumping
     * back to the start of the match.
     *
     * @param string $match the captured blockquote, including the
     *     leading `\n` consumed by the pattern
     * @param int $state lexer state (unused, the pattern is a special pattern)
     * @param int $pos byte offset of `$match` within the source
     * @param Handler $handler handler receiving the generated calls
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $stripped = ltrim($match, "\n");

        // Absolute offset of the current line's first character (its `>`).
        $lineStart = $pos + strlen($match) - strlen($stripped);

        $currentDepth = 0;
        $buffer = [];
        $segmentStart = 0; // always assigned before the first emitBody() call

        foreach (explode("\n", $stripped) as $line) {
            $p = $this->parseLine($line, $lineStart);

            if ($p['depth'] !== $currentDepth) {
                // Flush what was collected at the previous depth first.
                if ($buffer) {
                    $this->emitBody($handler, $segmentStart, implode("\n", $buffer));
                    $buffer = [];
                }

                // Position of the newline preceding this line; never
                // earlier than the match itself.
                $transitionPos = max($pos, $lineStart - 1);

                while ($currentDepth < $p['depth']) {
                    $handler->addCall('quote_open', [], $transitionPos);
                    $currentDepth++;
                }
                while ($currentDepth > $p['depth']) {
                    $handler->addCall('quote_close', [], $transitionPos);
                    $currentDepth--;
                }
            }

            // A fresh segment starts at this line's content offset.
            if (!$buffer) $segmentStart = $p['offset'];
            $buffer[] = $p['content'];

            $lineStart += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($buffer) {
            $this->emitBody($handler, $segmentStart, implode("\n", $buffer));
        }

        $endPos = $pos + strlen($match);
        while ($currentDepth > 0) {
            $handler->addCall('quote_close', [], $endPos);
            $currentDepth--;
        }

        return true;
    }

    /**
     * Parse one captured line into depth, content, and content offset.
     *
     * Counts leading `>` characters (each consuming one optional
     * trailing space) to compute the depth. The remainder of the line
     * is the content for that depth. The returned `offset` is the
     * absolute byte position of the content's first character within
     * the source (`$lineStart` plus the length of the consumed marker
     * prefix).
     *
     * `> > foo` → depth 2, content `foo`. `>>foo` → depth 2, content
     * `foo`. `>` alone → depth 1, content empty.
     *
     * @param string $line one line of captured blockquote text, with
     *     no surrounding newlines
     * @param int $lineStart absolute byte offset of the line's first
     *     character within the source
     * @return array{depth: int, content: string, offset: int}
     */
    protected function parseLine(string $line, int $lineStart): array
    {
        $depth = 0;
        $i = 0;
        $len = strlen($line);

        while ($i < $len && $line[$i] === '>') {
            $depth++;
            $i++;
            // A single space directly after a marker belongs to the marker.
            if ($i < $len && $line[$i] === ' ') $i++;
        }

        return [
            'depth'   => $depth,
            'content' => substr($line, $i),
            'offset'  => $lineStart + $i,
        ];
    }

    /**
     * Sub-parse a body segment and emit its calls inside a Nest.
     *
     * Drops `document_start` / `document_end` from the sub-parser
     * output. Under DW-preferred syntax, also runs the linebreak
     * post-pass so paragraph wrapping is flattened into explicit
     * `linebreak` calls. Empty bodies emit nothing.
     *
     * `$segmentStart` is the absolute byte offset of the segment's
     * first content character within the source. Sub-handler positions
     * are relative to the sub-parsed body, which begins at the first
     * line of the segment, so adding `$segmentStart` to each
     * sub-handler position lands the call back on the right byte in
     * the source. Lines after the first drift slightly because the
     * `>[ ]?` prefix between source lines collapses to a single `\n`
     * in the sub-parsed body — drift is bounded by the prefix length
     * (one or two bytes per line skipped).
     *
     * @param Handler $handler outer handler to emit calls on
     * @param int $segmentStart absolute byte offset of the segment's
     *     first content character within the source
     * @param string $body concatenated content of the buffered lines,
     *     separated by `\n`
     */
    protected function emitBody(Handler $handler, int $segmentStart, string $body): void
    {
        $registry = ModeRegistry::getInstance();
        $calls = $registry->withSubParser(
            [ModeRegistry::CATEGORY_BASEONLY],
            ['gfm_quote'],
            static function ($subParser) use ($body) {
                $subParser->getHandler()->reset();
                $subParser->parse($body);
                return $subParser->getHandler()->calls;
            }
        );

        // Strip the document wrapper; only the inner calls are re-emitted.
        if ($calls && $calls[0][0] === 'document_start') array_shift($calls);
        if ($calls && end($calls)[0] === 'document_end') array_pop($calls);

        if ($registry->isDwPreferred()) {
            $calls = $this->flattenForDwRendering($calls);
        }

        if (!$calls) return;

        // Route the calls through a Nest, then hand control back to
        // whatever writer Nest::process() returns.
        $outer = $handler->getCallWriter();
        $nest = new Nest($outer);
        $handler->setCallWriter($nest);
        foreach ($calls as $call) {
            $handler->addCall($call[0], $call[1], $segmentStart + $call[2]);
        }
        $handler->setCallWriter($nest->process());
    }

    /**
     * Flatten paragraph structure into linebreak-separated cdata.
     *
     * DW Quote historically rendered each `>`-line as a separate visible
     * line via an explicit `<br/>` between same-depth markers. To
     * preserve that rendering for DW-preferred installs, this pass:
     *
     *   1. Replaces every `p_open` and `p_close` with a `linebreak`
     *      call. After this, paragraph boundaries become two adjacent
     *      linebreaks (the close-of-prev plus the open-of-next), which
     *      matches the DW two-`<br/>`-for-blank-line shape.
     *   2. Drops all leading and all trailing `linebreak` calls so the
     *      run starts and ends with content, not break markers.
     *   3. Splits any `cdata` containing `\n` into multiple `cdata`
     *      calls separated by `linebreak` — sub-parsed paragraphs may
     *      contain soft breaks that a renderer would otherwise collapse
     *      to a single space. Empty fragments produce no `cdata` call;
     *      every fragment inherits the position of the original call.
     *
     * Block-level calls inside the body (list_open from a list inside
     * a quote, code, etc.) are passed through unchanged.
     *
     * @param array $calls sub-parsed call list to flatten
     * @return array the flattened call list
     */
    protected function flattenForDwRendering(array $calls): array
    {
        // Step 1: paragraph boundaries become linebreaks.
        $stage = [];
        foreach ($calls as $call) {
            if ($call[0] === 'p_open' || $call[0] === 'p_close') {
                $stage[] = ['linebreak', [], $call[2]];
            } else {
                $stage[] = $call;
            }
        }

        // Step 2: trim linebreaks from both ends.
        while ($stage && $stage[0][0] === 'linebreak') array_shift($stage);
        while ($stage && end($stage)[0] === 'linebreak') array_pop($stage);

        // Step 3: split multi-line cdata on its soft breaks.
        $out = [];
        foreach ($stage as $call) {
            if ($call[0] === 'cdata' && str_contains($call[1][0], "\n")) {
                $parts = explode("\n", $call[1][0]);
                foreach ($parts as $i => $part) {
                    if ($i > 0) $out[] = ['linebreak', [], $call[2]];
                    if ($part !== '') $out[] = ['cdata', [$part], $call[2]];
                }
            } else {
                $out[] = $call;
            }
        }

        return $out;
    }
}
269