xref: /dokuwiki/_test/tests/Parsing/Markdown/SpecCompatRenderer.php (revision f9d3b7bd008099dc4c61ce262a02a0ed8bc94254)
13440a8c0SAndreas Gohr<?php
23440a8c0SAndreas Gohr
33440a8c0SAndreas Gohrnamespace dokuwiki\test\Parsing\Markdown;
43440a8c0SAndreas Gohr
53440a8c0SAndreas Gohruse Doku_Renderer_xhtml;
63440a8c0SAndreas Gohr
/**
 * XHTML renderer tuned to emit the minimal HTML shape GFM's spec.txt uses.
 *
 * DokuWiki's production XHTML renderer wraps internal media in details
 * links pointing at `/lib/exe/fetch.php?media=...` / `/lib/exe/detail.php?media=...`,
 * rewrites internal link hrefs to `/doku.php?id=...`, and adds wiki-specific
 * classes and attributes. All of this is correct for live wiki pages but
 * diverges byte-for-byte from GFM's bare `<img src="...">` and
 * `<a href="...">...</a>`.
 *
 * This renderer is used only by {@see GfmSpecTest} so the spec roundtrip
 * can compare against byte-level spec HTML. Production rendering is
 * unchanged. Methods not overridden here fall through to the XHTML
 * renderer (paragraphs, emphasis, code spans, lists, etc.) — those render
 * the same shape the spec expects.
 *
 * Escaping policy used throughout this class:
 *  - attribute values (href, src, alt, class, align) go through hsc();
 *  - body text (cdata, link labels, code block bodies) goes through
 *    {@see SpecCompatRenderer::_xmlEntities()}, which leaves `'` literal.
 *
 * Note: title attributes on links/images are discarded at handle time
 * (no DW instruction slot), so spec examples that expect `title="..."`
 * still don't pass and stay in `skip.php`.
 */
class SpecCompatRenderer extends Doku_Renderer_xhtml
{
    /**
     * Open a table.
     *
     * Production DW wraps `<table>` in `<div class="table"><table class="inline">`;
     * the spec expects bare `<table>`. All arguments are ignored.
     */
    public function table_open($maxcols = null, $numrows = null, $pos = null, $classes = null)
    {
        $this->doc .= "<table>\n";
    }

    /**
     * Close a table. Emits only `</table>`; the production wrapper's
     * matching `</div>` is dropped because table_open() never opened it.
     */
    public function table_close($pos = null)
    {
        $this->doc .= "</table>";
    }

    /**
     * Open a table row without DW's `class="rowN"` row counter — spec
     * rows have no class.
     */
    public function tablerow_open($classes = null)
    {
        $this->doc .= "<tr>\n";
    }

    /**
     * Open a header cell.
     *
     * Production DW emits alignment as `class="...align"`; the spec uses
     * an `align="..."` attribute. The `class="colN"` counter is dropped
     * too. Only $align is used.
     */
    public function tableheader_open($colspan = 1, $align = null, $rowspan = 1, $classes = null)
    {
        $this->doc .= '<th' . $this->alignAttr($align) . '>';
    }

    /**
     * Open a data cell; same shape rules as tableheader_open().
     */
    public function tablecell_open($colspan = 1, $align = null, $rowspan = 1, $classes = null)
    {
        $this->doc .= '<td' . $this->alignAttr($align) . '>';
    }

    /**
     * Build the ` align="..."` attribute fragment for a table cell.
     *
     * @param string|null $align alignment keyword, or null for none
     * @return string attribute fragment with a leading space, or '' when $align is null
     */
    private function alignAttr(?string $align): string
    {
        if ($align === null) {
            return '';
        }
        // Escape like every other attribute value this class emits; the
        // usual keywords (left/right/center) pass through unchanged.
        return ' align="' . hsc($align) . '"';
    }

    /**
     * Render internal media as a bare `<img>`.
     *
     * Only $src, $title (used as alt text), $width and $height influence
     * the markup.
     *
     * @return string|null the markup when $return is truthy, otherwise null
     *                     after appending to the document
     */
    public function internalmedia(
        $src,
        $title = null,
        $align = null,
        $width = null,
        $height = null,
        $cache = null,
        $linking = null,
        $return = false
    ) {
        return $this->specEmit($this->specImg($src, $title, $width, $height), $return);
    }

    /**
     * Render external media as a bare `<img>`; see internalmedia().
     *
     * @return string|null the markup when $return is truthy, otherwise null
     */
    public function externalmedia(
        $src,
        $title = null,
        $align = null,
        $width = null,
        $height = null,
        $cache = null,
        $linking = null,
        $return = false
    ) {
        return $this->specEmit($this->specImg($src, $title, $width, $height), $return);
    }

    /**
     * Render an internal link with the page id used verbatim as the href
     * target (no `/doku.php?id=...` rewriting).
     *
     * @return string|null the markup when $returnonly is truthy, otherwise null
     */
    public function internallink($id, $name = null, $search = null, $returnonly = false, $linktype = 'content')
    {
        return $this->specEmit($this->specLink($id, $name), $returnonly);
    }

    /**
     * Render an external link as a bare `<a href>`.
     *
     * @return string|null the markup when $returnonly is truthy, otherwise null
     */
    public function externallink($url, $name = null, $returnonly = false)
    {
        return $this->specEmit($this->specLink($url, $name), $returnonly);
    }

    /**
     * Render an interwiki link.
     *
     * Spec has no interwiki expectations; emit the raw `wp>Page` form as
     * href so the mode is still visible but obviously non-standard.
     *
     * @return string|null the markup when $returnonly is truthy, otherwise null
     */
    public function interwikilink($match, $name, $wikiName, $wikiUri, $returnonly = false)
    {
        return $this->specEmit($this->specLink($match, $name), $returnonly);
    }

    /**
     * Render an e-mail link as `<a href="mailto:...">`; the address is
     * the label when no name is given.
     *
     * @return string|null the markup when $returnonly is truthy, otherwise null
     */
    public function emaillink($address, $name = null, $returnonly = false)
    {
        return $this->specEmit($this->specLink('mailto:' . $address, $name ?? $address), $returnonly);
    }

    /**
     * Render an in-page link as `<a href="#hash">`; the hash is the label
     * when no name is given.
     *
     * @return string|null the markup when $returnonly is truthy, otherwise null
     */
    public function locallink($hash, $name = null, $returnonly = false)
    {
        return $this->specEmit($this->specLink('#' . $hash, $name ?? $hash), $returnonly);
    }

    /**
     * Render a Windows share link as a bare `<a href>`.
     *
     * @return string|null the markup when $returnonly is truthy, otherwise null
     */
    public function windowssharelink($url, $name = null, $returnonly = false)
    {
        return $this->specEmit($this->specLink($url, $name), $returnonly);
    }

    /**
     * Render a code block in GFM shape; $filename and $options are ignored.
     */
    public function code($text, $language = null, $filename = null, $options = null)
    {
        $this->doc .= $this->specCode($text, $language);
    }

    /**
     * Emit a hard line break.
     *
     * Production DW emits `<br/>` (no space); the spec expects the
     * XHTML-classic `<br />` (space before the slash).
     */
    public function linebreak()
    {
        $this->doc .= '<br />' . DOKU_LF;
    }

    /**
     * Emit the original entity match, escaped but otherwise unchanged.
     *
     * The Entity mode rewrites --, ---, ->, (c), ... and other prose
     * abbreviations into typographic glyphs via conf/entities.conf.
     * Correct for live wiki pages, diverges byte-for-byte from the
     * GFM spec corpus which expects those bytes preserved literally.
     */
    public function entity($entity)
    {
        $this->doc .= $this->_xmlEntities((string) $entity);
    }

    /**
     * Escape body text the way the spec corpus does.
     *
     * Production hsc() escapes both `"` and `'` (ENT_QUOTES) so cdata
     * is safe to splice into any HTML attribute as well as body text.
     * CommonMark / GFM spec output uses a narrower body-text policy:
     * `"` is escaped to `&quot;` (e.g. example #323) but `'` is left
     * literal (e.g. example #670). ENT_COMPAT matches that exactly.
     * Attribute values rendered by SpecCompatRenderer (href, src, alt)
     * still go through hsc() in specLink / specImg, which escapes both.
     *
     * @param mixed $string text to escape (cast to string)
     * @return string escaped text
     */
    public function _xmlEntities($string)
    {
        return htmlspecialchars(
            (string) $string,
            ENT_COMPAT | ENT_SUBSTITUTE | ENT_HTML401,
            'UTF-8'
        );
    }

    /**
     * Open a blockquote.
     *
     * Production DW wraps blockquote content in `<div class="no">`;
     * the spec expects bare `<blockquote>...</blockquote>`.
     */
    public function quote_open()
    {
        $this->doc .= "<blockquote>\n";
    }

    /**
     * Close a blockquote opened by quote_open().
     */
    public function quote_close()
    {
        $this->doc .= "</blockquote>\n";
    }

    /**
     * Open an unordered list without any class attribute.
     */
    public function listu_open($classes = null)
    {
        $this->doc .= "<ul>\n";
    }

    /**
     * Close an unordered list.
     */
    public function listu_close()
    {
        $this->doc .= "</ul>\n";
    }

    /**
     * Open an ordered list without any class attribute.
     */
    public function listo_open($classes = null)
    {
        $this->doc .= "<ol>\n";
    }

    /**
     * Open an ordered list with an explicit start number.
     *
     * A start of 1 produces a plain `<ol>` (no `start` attribute); any
     * other value is emitted as `<ol start="N">`.
     *
     * @param int|string $start first item number (cast to int)
     */
    public function listo_open_start($start = 1)
    {
        $start = (int) $start;
        if ($start === 1) {
            $this->listo_open();
            return;
        }
        $this->doc .= '<ol start="' . $start . "\">\n";
    }

    /**
     * Close an ordered list.
     */
    public function listo_close()
    {
        $this->doc .= "</ol>\n";
    }

    /**
     * Open a list item as a bare `<li>`; $level and $node are ignored.
     */
    public function listitem_open($level, $node = false)
    {
        $this->doc .= '<li>';
    }

    /**
     * Close a list item.
     */
    public function listitem_close()
    {
        $this->doc .= "</li>\n";
    }

    /**
     * Intentionally emits nothing.
     *
     * GFM has no per-item content wrapper - tight items put text directly
     * inside <li>, loose items wrap it in <p>. The handler emits/strips
     * p_open / p_close to drive that distinction; the wrapper itself
     * produces no output here.
     */
    public function listcontent_open()
    {
    }

    /**
     * Intentionally emits nothing; see listcontent_open().
     */
    public function listcontent_close()
    {
    }

    /**
     * Render a file block in the same GFM shape as code(); $filename and
     * $options are ignored.
     */
    public function file($text, $language = null, $filename = null, $options = null)
    {
        $this->doc .= $this->specCode($text, $language);
    }

    /**
     * Render an indented (preformatted) block.
     *
     * The Preformatted CallWriter rewriter collapses start/content/
     * newline/end into one `preformatted` call. GFM expects the body
     * to end with a newline (spec example 104); DW's internal text
     * loses it to `trim()`, so we re-append here.
     */
    public function preformatted($text)
    {
        $this->doc .= $this->specCode($text . "\n", null);
    }

    /**
     * Either hand finished markup back to the caller or append it to the
     * document.
     *
     * The overridden link/media signatures carry a `$return` /
     * `$returnonly` flag; this helper honours it so callers that ask for
     * the markup only actually receive it and the document is left alone.
     *
     * @param string $html finished markup
     * @param mixed $returnOnly truthy to return the markup instead of appending it
     * @return string|null the markup when $returnOnly is truthy, otherwise null
     */
    private function specEmit(string $html, $returnOnly): ?string
    {
        if ($returnOnly) {
            return $html;
        }
        $this->doc .= $html;
        return null;
    }

    /**
     * GFM shape: <pre><code class="language-xxx">...</code></pre>. The
     * production DW renderer emits <pre class="code"> with no inner
     * <code>, which diverges byte-for-byte.
     *
     * @param mixed $text code body (cast to string)
     * @param mixed $language info string; null or '' omits the class attribute
     * @return string finished markup
     */
    private function specCode($text, $language): string
    {
        $classAttr = '';
        if ($language !== null && $language !== '') {
            $classAttr = ' class="language-' . hsc((string) $language) . '"';
        }
        // The code body is body text, not an attribute value, so it follows
        // the body-text escaping policy of _xmlEntities() (`'` stays literal).
        return '<pre><code' . $classAttr . '>' . $this->_xmlEntities($text) . '</code></pre>';
    }

    /**
     * Emit a bare `<img src="..." alt="..." />`, with width/height
     * attributes only when they are given.
     *
     * @param mixed $src image source (cast to string)
     * @param mixed $alt alternative text (cast to string; null becomes '')
     * @param mixed $width pixel width, or null to omit the attribute
     * @param mixed $height pixel height, or null to omit the attribute
     * @return string finished markup
     */
    private function specImg($src, $alt, $width, $height): string
    {
        $out = '<img src="' . hsc((string) $src) . '"';
        $out .= ' alt="' . hsc((string) $alt) . '"';
        if ($width !== null) {
            $out .= ' width="' . (int) $width . '"';
        }
        if ($height !== null) {
            $out .= ' height="' . (int) $height . '"';
        }
        $out .= ' />';
        return $out;
    }

    /**
     * Emit a bare <a href="...">label</a>. If the label is a media
     * descriptor array (the shape Media::parseMedia() returns, passed by
     * Internallink / GfmLink when the label is `{{img}}` / `![alt](img)`),
     * render the <img> inside the <a>.
     *
     * Only the href attribute is percent-encoded. When no label is given
     * the visible text falls back to the target exactly as written, not
     * to its percent-encoded form.
     *
     * @param mixed $href link target (cast to string)
     * @param mixed $label label text, media descriptor array, or null/'' for none
     * @return string finished markup
     */
    private function specLink($href, $label): string
    {
        $target = (string) $href;
        $hrefAttr = hsc($this->specEncodeUrl($target));

        if (is_array($label) && isset($label['type'])) {
            $img = $this->specImg(
                $label['src'] ?? '',
                $label['title'] ?? null,
                $label['width'] ?? null,
                $label['height'] ?? null
            );
            return '<a href="' . $hrefAttr . '">' . $img . '</a>';
        }

        $text = ($label === null || $label === '') ? $target : $label;
        // Label is body text: follow the _xmlEntities() policy, not hsc().
        return '<a href="' . $hrefAttr . '">' . $this->_xmlEntities($text) . '</a>';
    }

    /**
     * Percent-encode characters not in cmark-gfm's URL-safe set,
     * preserving existing %XX sequences. cmark-gfm's HREF_SAFE table
     * (houdini_href_e.c) excludes square brackets, backslash, caret,
     * backtick, and braces from the safe set even though RFC 3986
     * lists `[]` as reserved gen-delims; matching that table is what
     * lets the spec corpus round-trip byte-for-byte.
     *
     * The pattern has no `u` modifier, so it matches one byte at a time
     * and a multi-byte UTF-8 character becomes one %XX group per byte.
     *
     * @param string $url raw link target
     * @return string encoded target; the input unchanged if PCRE reports an error
     */
    private function specEncodeUrl(string $url): string
    {
        $encoded = preg_replace_callback(
            "/[^A-Za-z0-9\\-._~:\\/?#@!$&'()*+,;=%]/",
            static fn($m) => '%' . strtoupper(bin2hex($m[0])),
            $url
        );
        // preg_replace_callback() yields null on a PCRE error, which would
        // violate the declared string return type.
        return $encoded ?? $url;
    }
}
303