<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Handler\Nest;
use dokuwiki\Parsing\ModeRegistry;

/**
 * Block quote mode shared by the DokuWiki and GFM dialects.
 *
 * A single special pattern swallows a run of consecutive lines that all
 * start with `>` in column 0. Nesting is then worked out right here:
 * the leading `>` markers of every line are counted and the body
 * segments of equal depth are wrapped in matching `quote_open` /
 * `quote_close` calls. Recursing through the sub-parser for nesting is
 * intentionally avoided — every sub-parser run needs its own Handler,
 * and routing nesting through the registry pool would gain nothing
 * over simply walking the depths.
 *
 * The text of each equal-depth segment is handed to
 * ModeRegistry::withSubParser(), which makes block content (lists,
 * fenced code, tables) usable inside a quote. Two things are excluded
 * from that sub-parser:
 *
 *  - the BASEONLY category, so headers never trigger inside a quote.
 *    The reasoning is the same as for GfmListblock: header instructions
 *    feed TOC entries, section-edit anchors and section_open /
 *    section_close ranges, none of which fit inside a `<blockquote>`.
 *  - gfm_quote itself, because nesting is resolved at this level.
 *
 * Should a list inside a quote trigger gfm_quote again while its list
 * item is being sub-parsed, the registry pool supplies a different
 * parser instance for the same exclusion key, leaving the outer parse
 * state intact.
 *
 * Lazy continuation is not supported on purpose: each quote line has to
 * start with `>` in column 0 and the first line that does not ends the
 * quote. GfmListblock applies the same "marker on every line" policy.
 * The price is that a handful of CommonMark blockquote examples relying
 * on lazy continuation keep failing; the benefit is a single-pass,
 * predictable parser.
 *
 * The rendered shape follows the syntax preference. With an
 * MD-preferred setting (`md`, `md+dw`) the sub-parser's paragraphs are
 * kept, so a one-paragraph quote becomes
 * `<blockquote><p>...</p></blockquote>`. With a DW-preferred setting
 * (`dw`, `dw+md`) a post-pass turns paragraph wrapping into explicit
 * `linebreak` calls, keeping the established
 * `<blockquote>...line1<br/>line2...</blockquote>` output of existing
 * DW pages. Both variants use the same `quote_open` / `quote_close`
 * instructions, so renderers need no changes.
 */
class GfmQuote extends AbstractMode
{
    /** @inheritdoc */
    public function getSort()
    {
        return 220;
    }

    /** @inheritdoc */
    public function preConnect()
    {
        $registry = ModeRegistry::getInstance();
        $registry->registerBlockEolMode('gfm_quote');
    }

    /**
     * Register the pattern that grabs a complete blockquote at once.
     *
     * Every captured line must carry a `>` in column 0; the first line
     * without one terminates the match (no lazy continuation). A lone
     * `>` without any text is accepted: it stands for an empty
     * paragraph break within the quote (spec 240) or for an empty
     * quote (spec 239).
     *
     * The opening line is anchored with (?:^|\n)> instead of a plain
     * \n> so that the quote can still start when the block mode before
     * it (a table or a list) already consumed the separating \n while
     * exiting. Those modes have to exit on \n: at that boundary there
     * is no unmatched leading content a zero-width lookahead exit could
     * hang on to, and an exit made only of lookahead would trigger the
     * lexer's no-advance safety check. Allowing either a literal \n or
     * a line start (^ in PCRE multiline mode also matches directly
     * after a consumed \n) covers both situations. The following quote
     * lines keep the \n> anchor, since the line before them stops
     * right in front of its \n and thus always leaves it available.
     *
     * @param string $mode the lexer state name to wire the pattern into
     */
    public function connectTo($mode)
    {
        $pattern = '(?:^|\n)>[^\n]*(?:\n>[^\n]*)*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_quote');
    }

    /**
     * Turn one captured blockquote into quote_open / body / quote_close calls.
     *
     * The match is split into lines, each line is reduced to its depth
     * and content, and consecutive lines of equal depth are collected
     * into one segment. Whenever the depth changes, the collected
     * segment is emitted and the open quotes are adjusted to the new
     * depth.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // The match may start with the \n that anchored the pattern;
        // skip it but keep the source offsets of the lines correct.
        $text = ltrim($match, "\n");
        $lineStart = $pos + (strlen($match) - strlen($text));

        $lines = [];
        foreach (explode("\n", $text) as $raw) {
            $lines[] = $this->parseLine($raw, $lineStart);
            // advance past the line and the \n that explode() removed
            $lineStart += strlen($raw) + 1;
        }

        $openDepth = 0;
        $pending = [];
        $pendingStart = $pos;

        foreach ($lines as $info) {
            $wanted = $info['depth'];

            if ($wanted !== $openDepth) {
                // depth changes: flush what was gathered at the old depth
                if ($pending !== []) {
                    $this->emitBody($handler, $pendingStart, implode("\n", $pending));
                    $pending = [];
                }
                // only one of the two loops below ever iterates
                for (; $openDepth < $wanted; $openDepth++) {
                    $handler->addCall('quote_open', [], $pos);
                }
                for (; $openDepth > $wanted; $openDepth--) {
                    $handler->addCall('quote_close', [], $pos);
                }
            }

            // the first line of a fresh segment defines its source offset
            if ($pending === []) {
                $pendingStart = $info['offset'];
            }
            $pending[] = $info['content'];
        }

        if ($pending !== []) {
            $this->emitBody($handler, $pendingStart, implode("\n", $pending));
        }

        // close whatever is still open at the end of the match
        $matchEnd = $pos + strlen($match);
        for (; $openDepth > 0; $openDepth--) {
            $handler->addCall('quote_close', [], $matchEnd);
        }

        return true;
    }

    /**
     * Break a captured line down into depth, content and content offset.
     *
     * The depth is the number of leading `>` characters, where every
     * marker may be followed by exactly one space that is swallowed
     * with it. Everything behind the markers is the content. `offset`
     * is `$lineStart` plus the number of bytes taken up by the markers,
     * i.e. the absolute byte position of the first content character.
     *
     * Examples: `> > foo` and `>>foo` both give depth 2 with content
     * `foo`; a lone `>` gives depth 1 with empty content.
     *
     * @param string $line one line of captured blockquote text, with
     *                     no surrounding newlines
     * @param int $lineStart absolute byte offset of the line's first
     *                       character within the source
     * @return array{depth: int, content: string, offset: int}
     */
    protected function parseLine(string $line, int $lineStart): array
    {
        $markers = 0;
        $consumed = 0;

        // `??` makes reading past the end of the line yield '' instead of a warning
        while (($line[$consumed] ?? '') === '>') {
            $markers++;
            $consumed++;
            if (($line[$consumed] ?? '') === ' ') {
                $consumed++;
            }
        }

        return [
            'depth' => $markers,
            'content' => substr($line, $consumed),
            'offset' => $lineStart + $consumed,
        ];
    }

    /**
     * Run a body segment through the sub-parser and add the result as a Nest.
     *
     * A leading `document_start` and a trailing `document_end` in the
     * sub-parser output are discarded. When DW syntax is preferred the
     * calls additionally go through flattenForDwRendering(), replacing
     * paragraph wrapping with explicit `linebreak` calls. If no calls
     * remain, nothing is emitted.
     *
     * Positions reported by the sub-handler are relative to the
     * sub-parsed body, which starts at the first line of the segment;
     * adding `$segmentStart` maps them back into the source. For lines
     * after the first the mapping is approximate: the marker prefix
     * that precedes each source line is reduced to a bare `\n` in the
     * body, so every further line shifts the position by the length of
     * the prefix that was stripped from it.
     *
     * @param Handler $handler outer handler to emit calls on
     * @param int $segmentStart absolute byte offset of the segment's
     *                          first content character within the source
     * @param string $body concatenated content of the buffered lines,
     *                     separated by `\n`
     */
    protected function emitBody(Handler $handler, int $segmentStart, string $body): void
    {
        $registry = ModeRegistry::getInstance();

        $parseBody = static function ($subParser) use ($body) {
            $subParser->getHandler()->reset();
            $subParser->parse($body);
            return $subParser->getHandler()->calls;
        };
        $calls = $registry->withSubParser(
            [ModeRegistry::CATEGORY_BASEONLY],
            ['gfm_quote'],
            $parseBody
        );

        // strip the document frame the sub-parser puts around its output
        if ($calls && $calls[0][0] === 'document_start') {
            array_shift($calls);
        }
        if ($calls && end($calls)[0] === 'document_end') {
            array_pop($calls);
        }

        if ($registry->isDwPreferred()) {
            $calls = $this->flattenForDwRendering($calls);
        }

        if (!$calls) {
            return;
        }

        // Temporarily route the handler's output into a Nest, replay the
        // sub-parsed calls with source-relative positions, then let the
        // Nest hand back the writer to continue with.
        $outerWriter = $handler->getCallWriter();
        $nest = new Nest($outerWriter);
        $handler->setCallWriter($nest);
        foreach ($calls as [$name, $args, $relativePos]) {
            $handler->addCall($name, $args, $segmentStart + $relativePos);
        }
        $handler->setCallWriter($nest->process());
    }

    /**
     * Convert paragraph structure into cdata separated by linebreaks.
     *
     * The classic DW quote showed every `>` line on a line of its own
     * by placing an explicit `<br/>` between markers of equal depth.
     * To keep that look for DW-preferred installations this pass works
     * in three steps:
     *
     *  1. Each `p_open` and each `p_close` is swapped for a
     *     `linebreak` call. A paragraph boundary thereby turns into
     *     two neighbouring linebreaks (close of the previous plus open
     *     of the next paragraph), the same two-`<br/>` shape DW uses
     *     for a blank line.
     *  2. All `linebreak` calls at the very beginning and at the very
     *     end of the list are removed, so that the list is not framed
     *     by break markers.
     *  3. Every `cdata` call whose text contains `\n` is split at the
     *     newlines into separate `cdata` calls with a `linebreak`
     *     between them; empty pieces produce no `cdata`. Without this,
     *     soft breaks inside a sub-parsed paragraph could be collapsed
     *     to a single space by a renderer.
     *
     * All other calls (for example list_open of a list inside the
     * quote, code, ...) are kept as they are.
     *
     * @param array $calls sub-parsed call list to flatten
     * @return array the flattened call list
     */
    protected function flattenForDwRendering(array $calls): array
    {
        // step 1: paragraph delimiters become linebreaks
        $stage = array_values(array_map(
            static fn ($call) => in_array($call[0], ['p_open', 'p_close'], true)
                ? ['linebreak', [], $call[2]]
                : $call,
            $calls
        ));

        // step 2: cut away the linebreak runs at both ends
        $first = 0;
        $last = count($stage) - 1;
        while ($first <= $last && $stage[$first][0] === 'linebreak') {
            $first++;
        }
        while ($last >= $first && $stage[$last][0] === 'linebreak') {
            $last--;
        }
        $stage = array_slice($stage, $first, $last - $first + 1);

        // step 3: split multi-line cdata at its newlines
        $out = [];
        foreach ($stage as $call) {
            $isMultiline = $call[0] === 'cdata' && str_contains($call[1][0], "\n");
            if (!$isMultiline) {
                $out[] = $call;
                continue;
            }

            foreach (explode("\n", $call[1][0]) as $index => $piece) {
                if ($index > 0) {
                    $out[] = ['linebreak', [], $call[2]];
                }
                if ($piece !== '') {
                    $out[] = ['cdata', [$piece], $call[2]];
                }
            }
        }

        return $out;
    }
}