xref: /plugin/mdimport/MarkdownToDokuWiki.php (revision 7cb424c90c7e1aca7edae5f79baaec2e55306143)
1<?php
2
3declare(strict_types=1);
4
/**
 * Converts Markdown content to DokuWiki syntax.
 *
 * The input is processed line by line while tracking state for fenced code
 * blocks, tables, nested lists and buffered paragraph lines. Supported
 * constructs:
 * - Headers (Markdown levels 1-6, mapped onto DokuWiki's five heading levels)
 * - Bold, italic, inline code
 * - Links and images
 * - Unordered and ordered lists (nesting derived from indentation)
 * - Tables (column alignment taken from the delimiter row)
 * - Fenced code blocks (```)
 * - Blockquotes (one output line per input line, nesting depth preserved)
 * - Horizontal rules
 *
 * @license GPL 3 http://www.gnu.org/licenses/gpl-3.0.html
 * @author  sioc-de-narf
 */
class MarkdownToDokuWikiConverter
{
    /** Delimiter row of a Markdown table, e.g. "|---|:--:|" or "--- | ---". At least one dash per column. */
    private const TABLE_SEPARATOR_PATTERN = '/^\s*\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?\s*$/';

    /** ATX header: 1-6 hashes, whitespace, then the title text. */
    private const HEADER_PATTERN = '/^(#{1,6})\s+(.+)$/';

    /** Control characters delimiting the placeholders used by convertInline(). */
    private const TOKEN_OPEN = "\x02";
    private const TOKEN_CLOSE = "\x03";

    /** @var bool Whether we are currently inside a code block */
    private bool $inCodeBlock = false;

    /** @var array<int, array<int, string>> Rows of the current table (row 0 is the header) */
    private array $tableRows = [];

    /** @var array<int, string> Alignment ('left', 'right' or 'center') for each column of the current table */
    private array $tableAlignments = [];

    /** @var array<int, array{indent: int, type: string}> Stack tracking list nesting (indentation and type) */
    private array $listStack = [];

    /** @var array<int, string> Buffer for paragraph lines before they are flushed */
    private array $paragraphBuffer = [];

    /**
     * Remove YAML front matter from the beginning of the document.
     *
     * Leading blank lines are skipped. If the first non-blank line is exactly
     * '---' and a later line is exactly '---' or '...', everything up to and
     * including that closing line is dropped. Lines are compared after
     * trimming, so "\r\n" input is tolerated; the text is split on "\n" only.
     *
     * @param string $markdown The raw Markdown.
     * @return string Markdown without the front matter (unchanged if none was found).
     */
    private function stripYamlFrontMatter(string $markdown): string
    {
        $lines = explode("\n", $markdown);
        $lineCount = count($lines);

        // Locate the first line that is not blank.
        $start = 0;
        while ($start < $lineCount && trim($lines[$start]) === '') {
            $start++;
        }

        if ($start >= $lineCount || trim($lines[$start]) !== '---') {
            return $markdown;
        }

        // Search for the closing delimiter; without one the text is left untouched.
        for ($i = $start + 1; $i < $lineCount; $i++) {
            $candidate = trim($lines[$i]);
            if ($candidate === '---' || $candidate === '...') {
                return implode("\n", array_slice($lines, $i + 1));
            }
        }

        return $markdown;
    }

    /**
     * Convert Markdown to DokuWiki syntax.
     *
     * @param string $markdown The input Markdown text.
     * @return string The converted DokuWiki text.
     */
    public function convert(string $markdown): string
    {
        // Normalize line endings first so that front matter detection also
        // works for "\r"-only input.
        $markdown = str_replace(["\r\n", "\r"], "\n", $markdown);
        $markdown = $this->stripYamlFrontMatter($markdown);

        $lines = explode("\n", $markdown);
        $lineCount = count($lines);
        $output = [];
        $this->reset();

        $i = 0;
        while ($i < $lineCount) {
            $rawLine = $lines[$i];

            // Code block handling
            if (str_starts_with(trim($rawLine), '```')) {
                $this->handleCodeBlock($rawLine, $output);
                $i++;
                continue;
            }
            if ($this->inCodeBlock) {
                // Code is copied verbatim, tabs included.
                $output[] = $rawLine;
                $i++;
                continue;
            }

            // Outside code blocks tabs count as four spaces (list indentation).
            $line = str_replace("\t", '    ', $rawLine);
            $nextLine = $lines[$i + 1] ?? null;

            // Table detection
            if ($this->isTableStart($line, $nextLine)) {
                // Emit pending paragraph text before the table, not after it.
                $this->flushParagraph($output);
                $this->closeLists($output);
                $this->parseTable($lines, $i);
                $output[] = $this->renderTable();
                continue;
            }

            // Horizontal rule
            if ($this->isHorizontalRule($line)) {
                $this->flushParagraph($output);
                $this->closeLists($output);
                $output[] = '----';
                $i++;
                continue;
            }

            // Blockquote
            if ($this->isBlockquote($line)) {
                $this->flushParagraph($output);
                $output[] = $this->renderBlockquote($line);
                $i++;
                continue;
            }

            // List item
            if ($this->isListItem($line)) {
                $this->handleList($line, $output);
                $i++;
                continue;
            }

            // Header
            if ($this->isTitle($line)) {
                $this->flushParagraph($output);
                $this->closeLists($output);
                $output[] = $this->renderTitle($line);
                $i++;
                continue;
            }

            // Empty line
            if (trim($line) === '') {
                $this->flushParagraph($output);
                $output[] = '';
                $i++;
                continue;
            }

            // Normal paragraph line
            $this->paragraphBuffer[] = $this->convertInline($line);
            $i++;
        }

        $this->flushParagraph($output);
        if ($this->inCodeBlock) {
            // Unterminated fence: close it so the rest of the page is not swallowed.
            $output[] = '</code>';
            $this->inCodeBlock = false;
        }
        $this->closeLists($output);

        return implode("\n", $output);
    }

    /**
     * Reset internal state.
     */
    private function reset(): void
    {
        $this->inCodeBlock = false;
        $this->tableRows = [];
        $this->tableAlignments = [];
        $this->listStack = [];
        $this->paragraphBuffer = [];
    }

    /**
     * Handle a code block delimiter (```).
     *
     * An opening fence flushes the pending paragraph and emits "<code>" or
     * "<code LANG>", where LANG is the first word of the info string. A fence
     * seen while inside a block emits "</code>".
     *
     * @param string   $line   The current line.
     * @param string[] &$output The output array being built.
     */
    private function handleCodeBlock(string $line, array &$output): void
    {
        if ($this->inCodeBlock) {
            $output[] = '</code>';
            $this->inCodeBlock = false;
            return;
        }

        // Keep document order: text buffered before the fence goes out first.
        $this->flushParagraph($output);

        $info = trim(substr(trim($line), 3));
        // Only the first word is the language; DokuWiki would read a second
        // word inside <code ...> as a download file name.
        $lang = preg_match('/^[^\s`]+/', $info, $match) === 1 ? $match[0] : '';

        $output[] = '<code' . ($lang !== '' ? " $lang" : '') . '>';
        $this->inCodeBlock = true;
    }

    /**
     * Determine if a line starts a Markdown table.
     *
     * The line must contain a pipe, the next line must be a delimiter row
     * (dashes with optional colons and pipes), and both must have the same
     * number of cells.
     *
     * @param string      $line     The current line.
     * @param string|null $nextLine The next line (if any).
     * @return bool True if a table starts here.
     */
    private function isTableStart(string $line, ?string $nextLine): bool
    {
        if ($nextLine === null || !str_contains($line, '|')) {
            return false;
        }
        if (preg_match(self::TABLE_SEPARATOR_PATTERN, $nextLine) !== 1) {
            return false;
        }

        return count($this->parseTableRow($line)) === count($this->parseTableRow($nextLine));
    }

    /**
     * Parse a Markdown table from the current position.
     *
     * Fills $tableAlignments from the delimiter row and $tableRows with the
     * header followed by the body rows. Body rows are padded or truncated to
     * the header's column count.
     *
     * @param string[] $lines The whole array of lines.
     * @param int      &$i    Current index (will be advanced to after the table).
     */
    private function parseTable(array $lines, int &$i): void
    {
        $headerLine = $lines[$i++];
        $separatorLine = $lines[$i++];

        // Detect column alignments from the delimiter row
        $this->tableAlignments = array_map(
            static fn (string $part): string => match (true) {
                str_starts_with($part, ':') && str_ends_with($part, ':') => 'center',
                str_ends_with($part, ':') => 'right',
                default => 'left',
            },
            $this->parseTableRow($separatorLine)
        );

        $header = $this->parseTableRow($headerLine);
        $columnCount = count($header);
        $lineCount = count($lines);

        $this->tableRows = [$header];
        while (
            $i < $lineCount
            && str_contains($lines[$i], '|')
            && preg_match(self::TABLE_SEPARATOR_PATTERN, $lines[$i]) !== 1
        ) {
            $cells = $this->parseTableRow($lines[$i]);
            $this->tableRows[] = array_slice(array_pad($cells, $columnCount, ''), 0, $columnCount);
            $i++;
        }
    }

    /**
     * Parse a single Markdown table row into an array of cells.
     *
     * Surrounding whitespace and at most one leading and one trailing pipe
     * are removed before splitting, so empty edge cells survive.
     *
     * @param string $line The table row line.
     * @return string[] Array of trimmed cell contents.
     */
    private function parseTableRow(string $line): array
    {
        $row = trim($line);
        if (str_starts_with($row, '|')) {
            $row = substr($row, 1);
        }
        if (str_ends_with($row, '|')) {
            $row = substr($row, 0, -1);
        }

        return array_map('trim', explode('|', $row));
    }

    /**
     * Render the parsed table as DokuWiki syntax.
     *
     * Header cells are delimited by '^', body cells by '|', with a single
     * delimiter between neighbouring cells. Alignment is expressed through
     * padding: two spaces on the left for right-aligned cells and two spaces
     * on both sides for centered cells.
     *
     * @return string DokuWiki table representation.
     */
    private function renderTable(): string
    {
        $rendered = [];
        foreach ($this->tableRows as $rowIndex => $row) {
            $delimiter = $rowIndex === 0 ? '^' : '|';
            $line = $delimiter;
            foreach ($row as $colIndex => $cell) {
                [$padLeft, $padRight] = match ($this->tableAlignments[$colIndex] ?? 'left') {
                    'center' => ['  ', '  '],
                    'right' => ['  ', ' '],
                    default => [' ', ' '],
                };
                $line .= $padLeft . $this->convertInline($cell) . $padRight . $delimiter;
            }
            $rendered[] = $line;
        }

        return implode("\n", $rendered);
    }

    /**
     * Check if a line is a Markdown list item.
     *
     * @param string $line The line.
     * @return bool True if it's a list item.
     */
    private function isListItem(string $line): bool
    {
        return preg_match('/^\s*([\*\-\+]|\d+\.)\s/', $line) === 1;
    }

    /**
     * Handle a list item line, managing nesting via indentation.
     *
     * @param string   $line   The list item line.
     * @param string[] &$output The output array.
     */
    private function handleList(string $line, array &$output): void
    {
        $this->flushParagraph($output);
        $indent = $this->calculateIndent($line);
        $type = preg_match('/^\s*\d+\.\s/', $line) === 1 ? 'ordered' : 'unordered';

        // Drop every level that is at least as deep as this item
        while (
            $this->listStack !== []
            && $indent <= $this->listStack[array_key_last($this->listStack)]['indent']
        ) {
            array_pop($this->listStack);
        }

        $this->listStack[] = ['indent' => $indent, 'type' => $type];

        // DokuWiki list items need two spaces of indentation per level,
        // including the first level.
        $dokuIndent = str_repeat('  ', count($this->listStack));

        // Remove the list marker and surrounding spaces, then convert inline
        $content = preg_replace('/^\s*([\*\-\+]|\d+\.)\s+/', '', $line) ?? $line;
        $output[] = $dokuIndent . ($type === 'ordered' ? '- ' : '* ') . $this->convertInline($content);
    }

    /**
     * Calculate the indentation level (number of leading whitespace characters) of a line.
     *
     * @param string $line The line.
     * @return int Number of leading whitespace characters.
     */
    private function calculateIndent(string $line): int
    {
        return strlen($line) - strlen(ltrim($line));
    }

    /**
     * Close any remaining open lists (reset stack).
     *
     * @param string[] &$output The output array (unused, kept for consistency).
     */
    private function closeLists(array &$output): void
    {
        $this->listStack = [];
    }

    /**
     * Check if a line is a Markdown header (starts with #).
     *
     * @param string $line The line.
     * @return bool True if it's a header.
     */
    private function isTitle(string $line): bool
    {
        return preg_match(self::HEADER_PATTERN, trim($line)) === 1;
    }

    /**
     * Render a Markdown header as a DokuWiki header.
     *
     * Level 1 becomes six '=' signs, level 5 and level 6 both become two,
     * because a DokuWiki heading needs at least two. An optional closing run
     * of hashes ("## Title ##") is removed from the title.
     *
     * @param string $line The header line.
     * @return string DokuWiki header, or the line unchanged if it is not a header.
     */
    private function renderTitle(string $line): string
    {
        if (preg_match(self::HEADER_PATTERN, trim($line), $matches) !== 1) {
            return $line;
        }

        $level = strlen($matches[1]);
        $title = trim(preg_replace('/\s+#+$/', '', $matches[2]) ?? $matches[2]);
        $equals = str_repeat('=', max(2, 7 - $level));

        return "$equals $title $equals";
    }

    /**
     * Check if a line is a horizontal rule.
     *
     * Accepts three or more of the same character (-, * or _), optionally
     * separated by whitespace, e.g. "---", "***" or "* * *".
     *
     * @param string $line The line.
     * @return bool True if it's a horizontal rule.
     */
    private function isHorizontalRule(string $line): bool
    {
        return preg_match('/^([-*_])(?:\s*\1){2,}$/', trim($line)) === 1;
    }

    /**
     * Check if a line is a blockquote (starts with >).
     *
     * @param string $line The line.
     * @return bool True if it's a blockquote.
     */
    private function isBlockquote(string $line): bool
    {
        return str_starts_with(ltrim($line), '>');
    }

    /**
     * Render a blockquote line.
     *
     * Each leading '>' marker (markers may be separated by whitespace) adds
     * one DokuWiki quote level, so "> text" becomes "> text" and
     * "> > text" becomes ">> text".
     *
     * @param string $line The blockquote line.
     * @return string DokuWiki blockquote line.
     */
    private function renderBlockquote(string $line): string
    {
        $rest = ltrim($line);
        $depth = 0;
        while (str_starts_with($rest, '>')) {
            $depth++;
            $rest = ltrim(substr($rest, 1));
        }

        $content = rtrim($rest);
        if ($depth === 0) {
            return $this->convertInline($content);
        }

        return str_repeat('>', $depth) . ($content === '' ? '' : ' ' . $this->convertInline($content));
    }

    /**
     * Convert inline Markdown formatting to DokuWiki.
     *
     * Handles bold, italic, inline code, images, and links. Inline code,
     * images and link targets are swapped for placeholders before the
     * emphasis rules run, so their content is not rewritten; emitted bold
     * markers are protected the same way so the italic rule cannot re-match
     * them. Emphasis delimiters must touch the text they wrap, and
     * underscore emphasis is ignored inside words (snake_case stays intact).
     *
     * @param string $text The text to convert.
     * @return string Converted text.
     */
    private function convertInline(string $text): string
    {
        // The placeholder delimiters must not already occur in the input.
        $text = str_replace([self::TOKEN_OPEN, self::TOKEN_CLOSE], '', $text);

        /** @var array<string, string> $protected placeholder => final text */
        $protected = [];
        $protect = static function (string $replacement) use (&$protected): string {
            $token = self::TOKEN_OPEN . count($protected) . self::TOKEN_CLOSE;
            $protected[$token] = $replacement;
            return $token;
        };
        // On a PCRE failure the subject is kept as is instead of becoming null.
        $apply = static fn (string $pattern, callable $callback, string $subject): string
            => preg_replace_callback($pattern, $callback, $subject) ?? $subject;

        // Inline code: `code` → ''code''
        // NOTE: DokuWiki still interprets wiki markup inside ''...''.
        $text = $apply(
            '/`(.+?)`/',
            static fn (array $m): string => $protect("''" . $m[1] . "''"),
            $text
        );

        // Images: ![alt](url) → {{url|alt}}
        $text = $apply(
            '/!\[([^\]]*)\]\(([^)]+)\)/',
            static fn (array $m): string => $protect('{{' . $m[2] . '|' . $m[1] . '}}'),
            $text
        );

        // Links: [text](url) → [[url|text]]; only the URL is protected so the
        // link text still receives emphasis conversion.
        $text = $apply(
            '/\[([^\]]+)\]\(([^)]+)\)/',
            static fn (array $m): string => '[[' . $protect($m[2]) . '|' . $m[1] . ']]',
            $text
        );

        // Bold markers are emitted as a placeholder to hide them from the italic rules.
        $bold = $protect('**');

        // Bold + italic: ***text*** → **//text//**
        $text = $apply(
            '/\*\*\*(?!\s)(.+?)(?<!\s)\*\*\*/',
            static fn (array $m): string => $bold . '//' . $m[1] . '//' . $bold,
            $text
        );

        // Bold: **text** or __text__ → **text** (same in DokuWiki)
        $wrapBold = static fn (array $m): string => $bold . $m[1] . $bold;
        $text = $apply('/\*\*(?!\s)(.+?)(?<!\s)\*\*/', $wrapBold, $text);
        $text = $apply('/(?<![A-Za-z0-9_])__(?!\s)(.+?)(?<!\s)__(?![A-Za-z0-9_])/', $wrapBold, $text);

        // Italic: *text* or _text_ → //text//
        $wrapItalic = static fn (array $m): string => '//' . $m[1] . '//';
        $text = $apply('/\*(?!\s)(.+?)(?<!\s)\*/', $wrapItalic, $text);
        $text = $apply('/(?<![A-Za-z0-9_])_(?!\s)(.+?)(?<!\s)_(?![A-Za-z0-9_])/', $wrapItalic, $text);

        // Restore newest placeholders first: a protected value may itself
        // contain placeholders that were created earlier.
        foreach (array_reverse($protected, true) as $token => $replacement) {
            $text = str_replace($token, $replacement, $text);
        }

        return $text;
    }

    /**
     * Flush any buffered paragraph lines to the output.
     *
     * The buffered lines are joined with single spaces into one output line.
     *
     * @param string[] &$output The output array.
     */
    private function flushParagraph(array &$output): void
    {
        if ($this->paragraphBuffer === []) {
            return;
        }

        $output[] = implode(' ', $this->paragraphBuffer);
        $this->paragraphBuffer = [];
    }
}
438