xref: /dokuwiki/inc/Parsing/ParserMode/GfmLink.php (revision eb15e634e1400f6c4d78f5fb40179ca25f41574d)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Escape;
7use dokuwiki\Parsing\Helpers\HtmlEntity;
8use dokuwiki\Parsing\Helpers\Link;
9use dokuwiki\Parsing\Helpers\Media as MediaHelper;
10
/**
 * GFM inline link [text](url) with optional title [text](url "title").
 *
 * The link text may be either plain text (the common case) or an inline
 * image `![alt](imgUrl)` — the Markdown equivalent of DW's
 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
 * handler call with a media descriptor array in the label slot, reusing
 * the same flow that `Internallink` already drives. No new handler
 * instructions; renderers (xhtml, odt, metadata, …) already know how to
 * render a link whose label is a media descriptor.
 *
 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
 * plus handle-time parsing, rather than encoding every GFM rule at
 * pattern level.
 *
 * Deliberately not supported (see skip.php for the affected spec examples):
 *
 *   - Reference links [text][id] / [text][] / [foo] — the single-pass
 *     lexer cannot resolve forward references to [foo]: url definitions.
 *   - Pointy-bracket destinations [link](<foo bar>) — the simplified
 *     pattern will happily match, but handle() produces an internallink
 *     with a broken src; spec tests for this stay in skip.php.
 *   - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
 *     at first `)`, producing odd output; also in skip.php.
 *   - Title HTML attribute — DokuWiki link handler instructions have no
 *     title-attribute slot, and plumbing one through every renderer just
 *     for this is out of scope. The title parses cleanly but is discarded.
 *   - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
 *     — matches DW's policy: Internallink only converts the label to a
 *     media descriptor when it matches `^{{…}}$` exactly.
 */
class GfmLink extends AbstractMode
{
    // URL slot character set: any non-paren / non-newline char, OR a
    // backslash-escape sequence so an escaped `\)` doesn't terminate the
    // URL early (spec examples 504/506/508). Backslash-unescape is
    // applied post-extraction; the pattern only needs to keep escaped
    // close-parens from prematurely ending the match.
    private const URL_CHAR = '(?:\\\\.|[^)\n])';

    // Image sub-pattern reused for both the label alternative in the main
    // pattern and the image-as-label detector in handle(). No capture
    // groups here — the lexer wraps user patterns in a capture and
    // additional captures would renumber unpredictably.
    private const IMAGE_SUB = '!\[[^\[\]\n]*\]\(' . self::URL_CHAR . '+\)';

    /** @inheritdoc */
    public function getSort()
    {
        return 300;
    }

    /**
     * Register the `[text-or-image](url)` special pattern with the lexer.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        // Outer shape: `[text-or-image](url)`. The text class forbids
        // brackets and newlines; the image alternative explicitly matches
        // one inline image. The URL slot is one or more URL_CHAR units
        // (any char except `)` / newline, or a backslash-escape pair) —
        // handle() does the URL / title splitting post-entry, mirroring
        // how DW Internallink parses inside `[[...]]`.
        $pattern = '\[(?!\[)(?:[^\[\]\n]+|' . self::IMAGE_SUB . ')\]\(' . self::URL_CHAR . '+\)';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_link');
    }

    /**
     * Turn one lexer match into a single link handler call.
     *
     * @param string  $match   the full `[label](target)` text matched by the lexer
     * @param int     $state   lexer state (unused; the pattern is a special pattern)
     * @param int     $pos     byte position of the match, forwarded to the handler call
     * @param Handler $handler receives the classified link call
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Detect image-as-label `[![alt](img)](target)`. Parallels
        // Internallink's `^{{…}}$` check — when the label is exactly an
        // inline image, parse it into a media descriptor; otherwise
        // treat the label as plain text.
        //
        // The `s` (dot-all) modifier keeps this re-match in step with the
        // lexer, which compiles mode patterns with dot-all enabled (see
        // ParallelRegex's matching flags — re-check if those ever change).
        // Without it the `\\.` escape alternative would reject a
        // backslash-newline pair the lexer had accepted, and an image
        // label would be mis-split by the plain-text branch below.
        $imageLabelPattern = '/^\[(' . self::IMAGE_SUB . ')\]\((' . self::URL_CHAR . '+)\)$/s';
        if (preg_match($imageLabelPattern, $match, $m)) {
            $label     = $this->parseImageDescriptor($m[1]);
            $targetUrl = $this->extractUrl($m[2]);
        } else {
            // Plain text label can't contain `]`, so the first `](` is
            // the label/target separator.
            $sep       = strpos($match, '](');
            $label     = Escape::unescapeBackslashes(substr($match, 1, $sep - 1));
            $targetUrl = $this->extractUrl(substr($match, $sep + 2, -1));
        }

        // Classify on the raw URL so windowssharelink detection sees the
        // literal `\\host\path` runs intact — GFM's `\\` → `\` collapse
        // would otherwise destroy the share prefix.
        [$call, $args] = Link::classify($targetUrl, $label);
        if ($call !== 'windowssharelink') {
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }
        $handler->addCall($call, $args, $pos);
        return true;
    }

    /**
     * Extract the URL from a parenthesized payload: trim surrounding
     * whitespace, take the first whitespace-delimited token, then
     * apply GFM's URL-slot transformations (entity decoding;
     * backslash-unescape happens later, after Link::classify, because
     * windowssharelink detection needs the raw `\\` runs intact).
     * Any trailing title is discarded (no renderer slot for it).
     *
     * @param string $inside text between the `(` and `)` of the link target
     * @return string entity-decoded URL token, still carrying backslash escapes
     */
    private function extractUrl(string $inside): string
    {
        $inside = trim($inside);
        // Everything from the first space / tab / newline onwards is the
        // optional title and is dropped.
        $url    = substr($inside, 0, strcspn($inside, " \t\n"));
        return HtmlEntity::decode($url);
    }

    /**
     * Parse an inline image sub-match `![alt](imgUrl)` into the media
     * descriptor shape Media::parseMedia() returns, so the link handler
     * can treat it as a media label identically to `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the `![alt](imgUrl)` text, exactly as matched by IMAGE_SUB
     * @return array media descriptor with the keys type, src, title, align,
     *               width, height, cache and linking
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // The alt text cannot contain `]`, so the first `](` separates
        // alt from URL. Offsets skip the leading `![` and the final `)`.
        $sep    = strpos($imageMatch, '](');
        $alt    = Escape::unescapeBackslashes(substr($imageMatch, 2, $sep - 2));
        $imgUrl = Escape::unescapeBackslashes($this->extractUrl(substr($imageMatch, $sep + 2, -1)));

        $p = MediaHelper::parseParameters($imgUrl);
        $type = (media_isexternal($p['src']) || link_isinterwiki($p['src']))
            ? 'externalmedia'
            : 'internalmedia';

        return [
            'type'    => $type,
            'src'     => $p['src'],
            // An empty alt is reported as "no title" rather than ''.
            'title'   => $alt !== '' ? $alt : null,
            'align'   => $p['align'],
            'width'   => $p['width'],
            'height'  => $p['height'],
            'cache'   => $p['cache'],
            'linking' => $p['linking'],
        ];
    }
}
147