<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Handler\GfmLists;
use dokuwiki\Parsing\Handler\Nest;
use dokuwiki\Parsing\ModeRegistry;

/**
 * GFM list block.
 *
 * Captures an entire list block atomically (one addSpecialPattern match) and
 * walks the captured text in handle(), grouping lines into items. The per-item
 * loop runs between ModeRegistry::acquireSubParser() and
 * ModeRegistry::releaseSubParser() so each item's body is dedented to its
 * content column and parsed by a pooled sub-parser, and block content -
 * paragraphs, fenced code, blockquotes, plugin blocks - works inside items
 * uniformly without depending on column-0 anchoring of nested mode patterns.
 * If any nested mode requests a sub-parser with the same exclusion key while
 * ours is in use, the registry's pool hands them a different slot so their
 * reset() does not corrupt our state.
 *
 * Sub-parser mode set: every active mode except CATEGORY_BASEONLY (i.e. no
 * Header inside list items, since `<h1>`-`<h6>` inside `<li>` is never
 * desirable and section nesting must not span into items) and gfm_listblock
 * itself (defensive guard against lexer re-entry on pathological inputs;
 * normal nested lists are caught by the outer pattern instead).
 *
 * Each item's sub-parsed calls are wrapped in a `nest` instruction (see
 * Handler\Nest) before they reach the outer handler. This is essential:
 * the sub-parser's Block rewriter has already wrapped multi-paragraph
 * content in `p_open`/`p_close`, and without nest-wrapping the main
 * handler's Block rewriter would see those paragraphs and add another
 * `<p>` around the entire replayed range, producing nested `<p>` tags.
 * Block treats `nest` as opaque and the renderer base class unwraps it
 * transparently — the same pattern Footnote uses.
 *
 * Indentation rule: depth = (indent / 2) + 1. Tabs become two spaces
 * (see TAB_WIDTH). 1- and 3-space indents round down. Marker characters:
 * -, *, + (unordered) and digits followed by . or ) (ordered). Nested lists
 * are caught by the outer pattern (each marker at any 2-space-multiple
 * indent is its own item at the corresponding depth) and stitched back into
 * nested HTML by the GfmLists rewriter.
 */
class GfmListblock extends AbstractMode
{
    /**
     * Regex fragment matching one list marker.
     *
     * Either an unordered marker (`-`, `*`, `+`) or an ordered marker
     * (1-9 digits followed by `.` or `)`). Used by the entry pattern in
     * connectTo() and by the per-line classifier in parseItems().
     */
    protected const MARKER = '(?:[-*+]|\d{1,9}[.)])';

    /**
     * Number of spaces one tab in leading indentation is expanded to.
     *
     * Two, so that a single tab equals exactly one nesting level under the
     * depth = (indent / 2) + 1 rule.
     */
    protected const TAB_WIDTH = 2;

    /** @inheritdoc */
    public function getSort()
    {
        return 10;
    }

    /** @inheritdoc */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_listblock');
    }

    /**
     * Register the special pattern that captures a whole list block.
     *
     * The pattern starts on a marker line (any indent) and then loops over
     * four alternatives until none matches:
     *
     *   1. A subsequent marker line at any indent.
     *   2. An indented continuation line (two or more leading space/tab
     *      characters followed by content).
     *   3. A blank line followed by indented content (any number of
     *      intervening blank lines tolerated via the lookahead).
     *   4. A blank line followed by a next marker (same multi-blank
     *      tolerance as alt 3).
     *
     * The block ends naturally when none of the alternatives match — for
     * example a column-0 non-marker line, or blank lines followed by
     * non-list content.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        $pattern =
            '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))' .
            '(?:' .
                '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))' .
                '|' . '\n[ \t]{2,}\S[^\n]*' .
                '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]{2,}\S)' .
                '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]*' . self::MARKER . ')' .
            ')*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_listblock');
    }

    /**
     * Convert the captured block into handler calls.
     *
     * Sequence:
     *   1. parseItems() splits the captured text into per-item records.
     *   2. Install GfmLists as a CallWriter rewriter on the main handler.
     *   3. Emit list_open carrying the first item's marker.
     *   4. Acquire a sub-parser from the registry, then for each item:
     *      - If not the first, emit list_item carrying the item's marker.
     *      - Reset the sub-handler and sub-parse the dedented item body.
     *      - Filter document_start/end and the outer p_open/p_close pair
     *        for tight items (single paragraph).
     *      - Wrap the filtered calls in a Nest so the main handler's
     *        Block rewriter treats them as opaque.
     *   5. Release the sub-parser (also when a sub-parse throws), emit
     *      list_close and finalise the GfmLists rewriter.
     *
     * If parseItems() yields nothing the raw match is emitted as cdata.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $items = $this->parseItems($match);
        if (empty($items)) {
            $handler->addCall('cdata', [$match], $pos);
            return true;
        }

        $handler->setCallWriter(new GfmLists($handler->getCallWriter()));
        $handler->addCall('list_open', [$items[0]['markerMatch']], $pos);

        $registry = ModeRegistry::getInstance();
        $excludeCats = [ModeRegistry::CATEGORY_BASEONLY];
        $excludeModes = ['gfm_listblock'];
        $subParser = $registry->acquireSubParser($excludeCats, $excludeModes);

        try {
            $subHandler = $subParser->getHandler();

            foreach ($items as $i => $item) {
                $itemPos = $pos + $item['offset'];
                if ($i > 0) {
                    $handler->addCall('list_item', [$item['markerMatch']], $itemPos);
                }

                $subHandler->reset();
                $subParser->parse($item['body']);
                $itemCalls = $this->filterSubCalls($subHandler->calls);
                if (empty($itemCalls)) {
                    continue; // empty item — nothing to emit
                }

                // Wrap the item content in a Nest so the main handler's Block
                // rewriter does not double-wrap our already-paragraphed content.
                $nest = new Nest($handler->getCallWriter());
                $handler->setCallWriter($nest);
                foreach ($itemCalls as $call) {
                    // Sub-handler positions are relative to the item body;
                    // shift them back towards the source position.
                    // NOTE(review): $itemPos is the start of the marker line,
                    // not of the body text, so the result is off by the
                    // indent + marker prefix — confirm that is acceptable.
                    $handler->addCall($call[0], $call[1], $itemPos + $call[2]);
                }
                $handler->setCallWriter($nest->process());
            }
        } finally {
            // Always hand the sub-parser back, even if a sub-parse threw,
            // so the acquired slot is not left marked as in use.
            $registry->releaseSubParser($excludeCats, $excludeModes);
        }

        $handler->addCall('list_close', [], $pos + strlen($match));
        $reWriter = $handler->getCallWriter();
        $handler->setCallWriter($reWriter->process());

        return true;
    }

    /**
     * Walk the captured block, grouping lines into items.
     *
     * Each returned item describes one list_item: its marker (in the form
     * "\n{indent}{marker}" with tabs in the indent expanded), the dedented
     * body, the dedent column, and the byte offset of the marker line
     * within $match.
     *
     * Lines are classified as marker / continuation / blank. A marker line
     * starts a new item; continuation and blank lines accumulate into the
     * current item's body. Continuation lines are dedented by up to
     * indent + marker_width + 1 leading columns (the item's content column
     * for single-space-after-marker cases). Only the leading indentation of
     * a continuation line is tab-expanded; tabs inside the content are kept
     * verbatim, matching how the marker line's own content is treated.
     * Blank lines are kept as empty body lines while they are in the middle
     * of the body and stripped from the trailing edge by joinBody() so
     * single-paragraph items parse tight.
     *
     * @param string $match the raw special-pattern match (starts with \n)
     * @return array<int, array{markerMatch: string, dedent: int, body: string, offset: int}>
     */
    protected function parseItems($match)
    {
        $stripped = ltrim($match, "\n");
        $offsetBase = strlen($match) - strlen($stripped);
        $lines = explode("\n", $stripped);

        $items = [];
        $current = null;
        $bodyLines = [];
        $cursor = $offsetBase;

        foreach ($lines as $line) {
            $isMarker = preg_match(
                '/^([ \t]*)(' . self::MARKER . ')(?:[ \t](.*)|$)/',
                $line,
                $m
            );

            if ($isMarker) {
                // A new marker closes the item collected so far.
                if ($current !== null) {
                    $current['body'] = $this->joinBody($bodyLines);
                    $items[] = $current;
                }
                $indent = $this->expandTabs($m[1]);
                $marker = $m[2];
                // Group 3 is absent when the marker sits alone on its line.
                $firstLine = $m[3] ?? '';
                $current = [
                    'markerMatch' => "\n" . $indent . $marker,
                    'dedent' => strlen($indent) + strlen($marker) + 1,
                    'offset' => $cursor,
                ];
                $bodyLines = [$firstLine];
            } elseif ($current !== null) {
                if (trim($line) === '') {
                    $bodyLines[] = '';
                } else {
                    // Expand tabs in the leading indent only, then strip at
                    // most the item's dedent column from that indent.
                    $leadLen = strspn($line, " \t");
                    $leading = $this->expandTabs(substr($line, 0, $leadLen));
                    $strip = min($current['dedent'], strlen($leading));
                    $bodyLines[] = substr($leading, $strip) . substr($line, $leadLen);
                }
            }

            $cursor += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($current !== null) {
            $current['body'] = $this->joinBody($bodyLines);
            $items[] = $current;
        }

        return $items;
    }

    /**
     * Expand every tab in a run of indentation to TAB_WIDTH spaces.
     *
     * @param string $whitespace leading whitespace (spaces and/or tabs)
     * @return string the same indentation expressed in spaces only
     */
    protected function expandTabs(string $whitespace): string
    {
        return str_replace("\t", str_repeat(' ', self::TAB_WIDTH), $whitespace);
    }

    /**
     * Join body lines into a string, trimming trailing blank lines.
     *
     * Trailing blanks would reach the sub-parser and cause Block to wrap
     * the otherwise-single paragraph content in `p_open`/`p_close`,
     * forcing a tight item into loose-item shape. Stripping them here
     * preserves the tight rendering for items that look tight in source.
     *
     * @param string[] $lines
     * @return string the lines joined with "\n", without trailing newlines
     */
    protected function joinBody(array $lines): string
    {
        return rtrim(implode("\n", $lines), "\n");
    }

    /**
     * Filter the sub-parser's flat call list before nest-wrapping it.
     *
     * Drops a leading `document_start` and a trailing `document_end`, and
     * strips the outer `p_open` / `p_close` pair when the list contains
     * exactly one `p_open`, starts with it and ends with `p_close` (a tight
     * item), so the content sits inline inside `<li>`. Lists with more than
     * one `p_open` (loose items) are returned with their paragraphs intact.
     *
     * @param array $calls calls as collected by the sub-handler
     * @return array the filtered calls
     */
    protected function filterSubCalls(array $calls)
    {
        if ($calls && $calls[0][0] === 'document_start') {
            array_shift($calls);
        }
        if ($calls && end($calls)[0] === 'document_end') {
            array_pop($calls);
        }

        $pCount = 0;
        foreach ($calls as $c) {
            if ($c[0] === 'p_open') {
                $pCount++;
            }
        }

        // $pCount === 1 guarantees $calls is non-empty before indexing it.
        if (
            $pCount === 1
            && $calls[0][0] === 'p_open'
            && end($calls)[0] === 'p_close'
        ) {
            array_shift($calls);
            array_pop($calls);
        }

        return $calls;
    }
}