xref: /dokuwiki/inc/Parsing/ParserMode/GfmLink.php (revision 47a02a102092be9e1e6f1ddaf158bdfffdb13d4f)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Escape;
7use dokuwiki\Parsing\Helpers\HtmlEntity;
8use dokuwiki\Parsing\Helpers\Link;
9use dokuwiki\Parsing\Helpers\Media as MediaHelper;
10
11/**
12 * GFM inline link [text](url) with optional title [text](url "title").
13 *
14 * The link text may be either plain text (the common case) or an inline
15 * image `![alt](imgUrl)` — the Markdown equivalent of DW's
16 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
17 * handler call with a media descriptor array in the label slot, reusing
18 * the same flow that `Internallink` already drives. No new handler
19 * instructions; renderers (xhtml, odt, metadata, …) already know how to
20 * render a link whose label is a media descriptor.
21 *
22 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
23 * plus handle-time parsing, rather than encoding every GFM rule at
24 * pattern level.
25 *
26 * Deliberately not supported (see skip.php for the affected spec examples):
27 *
28 *   - Reference links [text][id] / [text][] / [foo] — the single-pass
29 *     lexer cannot resolve forward references to [foo]: url definitions.
30 *   - Pointy-bracket destinations [link](<foo bar>) — the simplified
31 *     pattern will happily match, but handle() produces an internallink
32 *     with a broken src; spec tests for this stay in skip.php.
33 *   - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
34 *     at first `)`, producing odd output; also in skip.php.
35 *   - Title HTML attribute — DokuWiki link handler instructions have no
36 *     title-attribute slot, and plumbing one through every renderer just
37 *     for this is out of scope. The title parses cleanly but is discarded.
38 *   - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
39 *     — matches DW's policy: Internallink only converts the label to a
40 *     media descriptor when it matches `^{{…}}$` exactly.
41 */
class GfmLink extends AbstractMode
{
    // URL slot character set: any non-paren / non-newline char, OR a
    // backslash-escape sequence so an escaped `\)` doesn't terminate the
    // URL early (spec examples 504/506/508). Backslash-unescape is
    // applied post-extraction; the pattern only needs to keep escaped
    // close-parens from prematurely ending the match.
    private const URL_CHAR = '(?:\\\\.|[^)\n])';

    // Label character set: forbids unescaped `[` / `]` so the outer
    // bracket pair stays balanced, but allows `\[` / `\]` so an escaped
    // bracket can appear inside the label (spec example 523). The same
    // backslash-escape trick the URL slot already uses. A bare `\n` is
    // permitted as long as it is not followed by a blank line — soft
    // line breaks inside link text are allowed by the spec, blank lines
    // are not (and they would also tie up `\n#`-anchored block modes).
    //
    // NOTE(review): a backslash satisfies both the escape alternative and
    // the plain-character class, so the alternation is ambiguous. Worth
    // checking whether long backslash runs in a non-matching `[...` can
    // trigger excessive backtracking.
    private const LABEL_CHAR = '(?:\\\\.|[^\[\]\n]|\n(?![ \t]*\n))';

    // Image sub-pattern reused for both the label alternative in the main
    // pattern and the image-as-label detector in handle(). No capture
    // groups here — the lexer wraps user patterns in a capture and
    // additional captures would renumber unpredictably.
    private const IMAGE_SUB = '!\[' . self::LABEL_CHAR . '*\]\(' . self::URL_CHAR . '+\)';

    /** @inheritdoc */
    public function getSort()
    {
        return 300;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // Outer shape: `[text-or-image](url)`. Text class forbids
        // unescaped brackets and blank lines but allows `\[` / `\]`; the
        // image alternative explicitly matches one inline image. URL
        // slot is permissive — handle() does URL / title splitting
        // post-entry, mirroring how DW Internallink parses inside `[[...]]`.
        $pattern = '\[(?!\[)(?:' . self::LABEL_CHAR . '+|' . self::IMAGE_SUB . ')\]\(' . self::URL_CHAR . '+\)';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_link');
    }

    /**
     * Turn one matched `[label](target)` into a single link handler call.
     *
     * @param string  $match   the full text matched by the pattern registered in connectTo()
     * @param int     $state   lexer state (not used here)
     * @param int     $pos     position of the match, passed through to the handler call
     * @param Handler $handler receives the resulting call
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Detect image-as-label `[![alt](img)](target)`. Parallels
        // Internallink's `^{{…}}$` check — when the label is exactly an
        // inline image, parse it into a media descriptor; otherwise
        // treat the label as plain text.
        if (preg_match('/^\[(' . self::IMAGE_SUB . ')\]\((' . self::URL_CHAR . '+)\)$/', $match, $m)) {
            $label     = $this->parseImageDescriptor($m[1]);
            $targetUrl = $this->extractUrl($m[2]);
        } else {
            // A plain text label cannot contain an unescaped `]`, but it
            // may contain the escaped form `\]` — possibly followed by
            // `(` — so the separator has to be located escape-aware
            // rather than by searching for the first literal `](`.
            [$rawLabel, $rawTarget] = $this->splitLabelAndPayload($match, '[', '+');
            $label     = Escape::unescapeBackslashes($rawLabel);
            $targetUrl = $this->extractUrl($rawTarget);
        }

        // Classify on the raw URL so windowssharelink detection sees the
        // literal `\\host\path` runs intact — GFM's `\\` → `\` collapse
        // would otherwise destroy the share prefix.
        [$call, $args] = Link::classify($targetUrl, $label);
        if ($call !== 'windowssharelink') {
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }
        $handler->addCall($call, $args, $pos);
        return true;
    }

    /**
     * Split `<opener>label](payload)` into its raw label and raw payload.
     *
     * Uses the same LABEL_CHAR / URL_CHAR sub-patterns as the pattern
     * registered in connectTo(), anchored to the whole string, so
     * backslash escapes are consumed as units and an escaped `\](`
     * inside the label is not mistaken for the label/payload separator.
     * If the anchored pattern does not match (or PCRE reports an error),
     * the split falls back to the first literal `](`.
     *
     * @param string $text       complete `[label](payload)` or `![label](payload)` string
     * @param string $opener     literal opening token, `[` or `![`
     * @param string $quantifier regex quantifier applied to LABEL_CHAR (`+` or `*`)
     * @return string[] two elements: raw label, raw payload (both still escaped)
     */
    private function splitLabelAndPayload(string $text, string $opener, string $quantifier): array
    {
        $pattern = '/^' . preg_quote($opener, '/')
            . '(' . self::LABEL_CHAR . $quantifier . ')'
            . '\]\((' . self::URL_CHAR . '+)\)$/';
        if (preg_match($pattern, $text, $m) === 1) {
            return [$m[1], $m[2]];
        }

        // Fallback: split at the first literal `](`.
        $start = strlen($opener);
        $sep   = strpos($text, '](');
        if ($sep === false) {
            // Not expected for lexer-produced matches; avoid doing
            // arithmetic on `false` and hand back an empty payload.
            return [substr($text, $start), ''];
        }
        return [substr($text, $start, $sep - $start), substr($text, $sep + 2, -1)];
    }

    /**
     * Extract the URL from a parenthesized payload: trim surrounding
     * whitespace, take the first whitespace-delimited token, then
     * apply GFM's URL-slot transformations (entity decoding;
     * backslash-unescape happens later, after Link::classify, because
     * windowssharelink detection needs the raw `\\` runs intact).
     * Any trailing title is discarded (no renderer slot for it).
     *
     * @param string $inside the text between the parentheses
     * @return string the entity-decoded URL token
     */
    private function extractUrl(string $inside): string
    {
        $inside = trim($inside);
        $url    = substr($inside, 0, strcspn($inside, " \t\n")); // remove optional title
        return HtmlEntity::decode($url);
    }

    /**
     * Parse an inline image sub-match `![alt](imgUrl)` into the media
     * descriptor shape Media::parseMedia() returns, so the link handler
     * can treat it as a media label identically to `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the complete `![alt](imgUrl)` text
     * @return array descriptor with keys type, src, title, align, width,
     *               height, cache and linking
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // The alt text may contain `\](`, so split escape-aware here too.
        [$rawAlt, $rawSrc] = $this->splitLabelAndPayload($imageMatch, '![', '*');
        $alt    = Escape::unescapeBackslashes($rawAlt);
        $imgUrl = Escape::unescapeBackslashes($this->extractUrl($rawSrc));

        $p = MediaHelper::parseParameters($imgUrl);
        $type = (media_isexternal($p['src']) || link_isinterwiki($p['src']))
            ? 'externalmedia'
            : 'internalmedia';

        return [
            'type'    => $type,
            'src'     => $p['src'],
            'title'   => $alt !== '' ? $alt : null, // empty alt text means no title
            'align'   => $p['align'],
            'width'   => $p['width'],
            'height'  => $p['height'],
            'cache'   => $p['cache'],
            'linking' => $p['linking'],
        ];
    }
}
156