<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Handler\GfmLists;
use dokuwiki\Parsing\Handler\Nest;
use dokuwiki\Parsing\ModeRegistry;

/**
 * GFM list block.
 *
 * Captures an entire list block atomically (one addSpecialPattern match) and
 * walks the captured text in handle(), grouping lines into items. Each item's
 * body is dedented to its content column and parsed by a cached sub-parser
 * (ModeRegistry::getSubParser) so block content - paragraphs, fenced code,
 * blockquotes, plugin blocks - works inside items uniformly without depending
 * on column-0 anchoring of nested mode patterns.
 *
 * Sub-parser mode set: every active mode except CATEGORY_BASEONLY (i.e. no
 * Header inside list items, since `<h1>`-`<h6>` inside `<li>` is never
 * desirable and section nesting must not span into items) and gfm_listblock
 * itself (defensive guard against lexer re-entry on pathological inputs;
 * normal nested lists are caught by the outer pattern instead).
 *
 * Each item's sub-parsed calls are wrapped in a `nest` instruction (see
 * Handler\Nest) before they reach the outer handler. This is essential:
 * the sub-parser's Block rewriter has already wrapped multi-paragraph
 * content in `p_open`/`p_close`, and without nest-wrapping the main
 * handler's Block rewriter would see those paragraphs and add another
 * `<p>` around the entire replayed range, producing nested `<p>` tags.
 * Block treats `nest` as opaque and the renderer base class unwraps it
 * transparently — the same pattern Footnote uses.
 *
 * Indentation rule: depth = (indent / 2) + 1. Tabs become two spaces
 * (see TAB_WIDTH). 1- and 3-space indents round down. Marker characters:
 * -, *, + (unordered) and digits followed by . or ) (ordered).
 * Nested lists are caught by the outer pattern (each marker at any
 * 2-space-multiple indent is its own item at the corresponding depth) and
 * stitched back into nested HTML by the GfmLists rewriter.
 */
class GfmListblock extends AbstractMode
{
    /**
     * Regex fragment matching one list marker.
     *
     * Either an unordered marker (`-`, `*`, `+`) or an ordered marker
     * (1-9 digits followed by `.` or `)`). Used by the entry pattern in
     * connectTo() and by the per-line classifier in parseItems().
     */
    protected const MARKER = '(?:[-*+]|\d{1,9}[.)])';

    /**
     * Number of spaces one tab character counts for when measuring indent.
     *
     * Kept as a number (rather than a whitespace string literal) so the
     * width stays unambiguous: one nesting level is two columns, therefore
     * one tab must equal exactly one nesting level.
     */
    protected const TAB_WIDTH = 2;

    /** @inheritdoc */
    public function getSort()
    {
        return 10;
    }

    /** @inheritdoc */
    public function preConnect()
    {
        ModeRegistry::getInstance()->registerBlockEolMode('gfm_listblock');
    }

    /**
     * Register the special pattern that captures a whole list block.
     *
     * The pattern starts on a marker line (any indent) and then loops over
     * four alternatives until none matches:
     *
     *  1. A subsequent marker line at any indent.
     *  2. An indented continuation line (>= 2 leading whitespace
     *     characters followed by content).
     *  3. A blank line followed by indented content (any number of
     *     intervening blank lines tolerated via the lookahead).
     *  4. A blank line followed by a next marker (same multi-blank
     *     tolerance as alt 3).
     *
     * The block ends naturally when none of the alternatives match — for
     * example a column-0 non-marker line, or two-or-more blank lines
     * followed by non-list content.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        $pattern =
            '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))' .
            '(?:' .
            '\n[ \t]*' . self::MARKER . '(?:[ \t][^\n]*|(?=\n))' .
            '|' . '\n[ \t]{2,}\S[^\n]*' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]{2,}\S)' .
            '|' . '\n[ \t]*(?=(?:\n[ \t]*)*\n[ \t]*' . self::MARKER . ')' .
            ')*';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_listblock');
    }

    /**
     * Convert the captured block into handler calls.
     *
     * Sequence:
     *  1. parseItems() splits the captured text into per-item records.
     *  2. Install GfmLists as a CallWriter rewriter on the main handler.
     *  3. Emit list_open carrying the first item's marker — the rewriter's
     *     handleListOpen opens the `<ul>`/`<ol>` and the first `<li>`.
     *  4. For each item:
     *     - If not the first, emit list_item (closes the previous `<li>`
     *       and opens a new one in the rewriter).
     *     - Sub-parse the dedented item body via the cached sub-parser.
     *     - Filter document_start/end and the outer p_open/p_close pair
     *       for tight items (single paragraph).
     *     - Wrap the filtered calls in a Nest so the main handler's
     *       Block rewriter treats them as opaque.
     *  5. Emit list_close and finalise the GfmLists rewriter.
     *
     * If parseItems() finds no item at all, the raw match is emitted as
     * plain cdata so no input is lost.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $items = $this->parseItems($match);
        if (empty($items)) {
            $handler->addCall('cdata', [$match], $pos);
            return true;
        }

        $handler->setCallWriter(new GfmLists($handler->getCallWriter()));
        $handler->addCall('list_open', [$items[0]['markerMatch']], $pos);

        $subParser = ModeRegistry::getInstance()
            ->getSubParser([ModeRegistry::CATEGORY_BASEONLY], ['gfm_listblock']);
        $subHandler = $subParser->getHandler();

        foreach ($items as $i => $item) {
            $itemPos = $pos + $item['offset'];
            if ($i > 0) {
                $handler->addCall('list_item', [$item['markerMatch']], $itemPos);
            }

            // the sub-parser is cached and shared, so clear the previous
            // item's calls before parsing the next body
            $subHandler->reset();
            $subParser->parse($item['body']);
            $itemCalls = $this->filterSubCalls($subHandler->calls);
            if (empty($itemCalls)) {
                continue; // empty item — nothing to emit
            }

            // Wrap the item content in a Nest so the main handler's Block
            // rewriter does not double-wrap our already-paragraphed content.
            // Block treats `nest` as opaque and the renderer base class
            // unwraps it transparently, the same pattern Footnote uses.
            $outer = $handler->getCallWriter();
            $nest = new Nest($outer);
            $handler->setCallWriter($nest);
            foreach ($itemCalls as $call) {
                // sub-handler positions are relative to the item body; offset
                // them back into the source so section-edit anchors work.
                $handler->addCall($call[0], $call[1], $itemPos + $call[2]);
            }
            $handler->setCallWriter($nest->process());
        }

        $handler->addCall('list_close', [], $pos + strlen($match));
        $reWriter = $handler->getCallWriter();
        $handler->setCallWriter($reWriter->process());

        return true;
    }

    /**
     * Walk the captured block, grouping lines into items.
     *
     * Each returned item describes one list_item: its marker (in the form
     * "\n{indent}{marker}" so GfmLists::interpretSyntax can parse it), the
     * dedented body, dedent column, and offset within $match.
     *
     * Lines are classified as marker / continuation / blank. A marker line
     * starts a new item; continuation and blank lines accumulate into the
     * current item's body. Continuation lines are dedented by up to
     * indent + marker_width + 1 leading spaces (the item's content column
     * for single-space-after-marker cases). Blank lines are kept as empty
     * body lines while they're in the middle of the body and stripped
     * from the trailing edge by joinBody() so single-paragraph items
     * parse tight.
     *
     * Tabs are expanded to TAB_WIDTH spaces before any column arithmetic,
     * so a tab-indented marker lands one nesting level deeper, as the
     * indentation rule requires. Offsets are always computed from the raw
     * (unexpanded) line lengths so they keep pointing into the source.
     *
     * @param string $match the raw special-pattern match (starts with \n)
     * @return array<int, array{markerMatch: string, dedent: int, body: string, offset: int}>
     */
    protected function parseItems($match)
    {
        $stripped = ltrim($match, "\n");
        $offsetBase = strlen($match) - strlen($stripped);
        $lines = explode("\n", $stripped);

        $items = [];
        $current = null;
        $bodyLines = [];
        $cursor = $offsetBase;

        foreach ($lines as $line) {
            $isMarker = preg_match(
                '/^([ \t]*)(' . self::MARKER . ')(?:[ \t](.*)|$)/',
                $line,
                $m
            );

            if ($isMarker) {
                // a new marker closes the item collected so far
                if ($current !== null) {
                    $current['body'] = $this->joinBody($bodyLines);
                    $items[] = $current;
                }
                $indent = $this->expandTabs($m[1]);
                $marker = $m[2];
                // group 3 is unset for a bare marker with nothing after it
                $firstLine = $m[3] ?? '';
                $current = [
                    'markerMatch' => "\n" . $indent . $marker,
                    'dedent' => strlen($indent) + strlen($marker) + 1,
                    'offset' => $cursor,
                ];
                $bodyLines = [$firstLine];
            } elseif ($current !== null) {
                if (trim($line) === '') {
                    $bodyLines[] = '';
                } else {
                    $expanded = $this->expandTabs($line);
                    $available = strlen($expanded) - strlen(ltrim($expanded, ' '));
                    // never strip more than the line's own leading spaces
                    $strip = min($current['dedent'], $available);
                    $bodyLines[] = substr($expanded, $strip);
                }
            }

            $cursor += strlen($line) + 1; // +1 for the \n consumed by explode
        }

        if ($current !== null) {
            $current['body'] = $this->joinBody($bodyLines);
            $items[] = $current;
        }

        return $items;
    }

    /**
     * Replace every tab with TAB_WIDTH spaces.
     *
     * Single point of truth for tab handling so that marker indents and
     * continuation-line dedents are always measured in the same unit.
     *
     * @param string $text
     * @return string
     */
    protected function expandTabs($text)
    {
        return str_replace("\t", str_repeat(' ', self::TAB_WIDTH), $text);
    }

    /**
     * Join body lines into a string, trimming trailing blank lines.
     *
     * Trailing blanks would reach the sub-parser and cause Block to wrap
     * the otherwise-single paragraph content in `p_open`/`p_close`,
     * forcing a tight item into loose-item shape. Stripping them here
     * preserves the tight rendering for items that look tight in source.
     *
     * @param string[] $lines
     * @return string the lines joined by "\n" without trailing newlines
     */
    protected function joinBody(array $lines): string
    {
        return rtrim(implode("\n", $lines), "\n");
    }

    /**
     * Filter the sub-parser's flat call list before nest-wrapping it.
     *
     * Drops `document_start` / `document_end` (always emitted by
     * Handler::finalize), and strips the outer `p_open` / `p_close` pair
     * for tight items so their content sits inline inside `<li>`. Loose
     * items (multiple paragraphs, more than one `p_open`) keep their
     * inner pairs untouched. The filtered calls are then wrapped in a
     * Nest by handle() before they reach the GfmLists rewriter.
     *
     * @param array $calls
     * @return array
     */
    protected function filterSubCalls(array $calls)
    {
        if ($calls && $calls[0][0] === 'document_start') {
            array_shift($calls);
        }
        if ($calls && end($calls)[0] === 'document_end') {
            array_pop($calls);
        }

        $pCount = 0;
        foreach ($calls as $c) {
            if ($c[0] === 'p_open') {
                $pCount++;
            }
        }

        // exactly one paragraph that spans the whole item => tight item
        if (
            $pCount === 1
            && $calls[0][0] === 'p_open'
            && end($calls)[0] === 'p_close'
        ) {
            array_shift($calls);
            array_pop($calls);
        }

        return $calls;
    }
}