xref: /dokuwiki/inc/Parsing/ParserMode/GfmLink.php (revision 1beb745045ec82c76dc27b814d474310020435ed)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Escape;
7use dokuwiki\Parsing\Helpers\HtmlEntity;
8use dokuwiki\Parsing\Helpers\Link;
9use dokuwiki\Parsing\Helpers\Media as MediaHelper;
10
11/**
12 * GFM inline link [text](url) with optional title [text](url "title").
13 *
14 * The link text may be either plain text (the common case) or an inline
15 * image `![alt](imgUrl)` — the Markdown equivalent of DW's
16 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
17 * handler call with a media descriptor array in the label slot, reusing
18 * the same flow that `Internallink` already drives. No new handler
19 * instructions; renderers (xhtml, odt, metadata, …) already know how to
20 * render a link whose label is a media descriptor.
21 *
22 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
23 * plus handle-time parsing, rather than encoding every GFM rule at
24 * pattern level.
25 *
26 * Deliberately not supported (see skip.php for the affected spec examples):
27 *
28 *   - Reference links [text][id] / [text][] / [foo] — the single-pass
29 *     lexer cannot resolve forward references to [foo]: url definitions.
30 *   - Pointy-bracket destinations [link](<foo bar>) — the simplified
31 *     pattern will happily match, but handle() produces an internallink
32 *     with a broken src; spec tests for this stay in skip.php.
33 *   - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
34 *     at first `)`, producing odd output; also in skip.php.
35 *   - Title HTML attribute — DokuWiki link handler instructions have no
36 *     title-attribute slot, and plumbing one through every renderer just
37 *     for this is out of scope. The title parses cleanly but is discarded.
38 *   - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
39 *     — matches DW's policy: Internallink only converts the label to a
40 *     media descriptor when it matches `^{{…}}$` exactly.
41 */
class GfmLink extends AbstractMode
{
    // URL slot character set: any non-paren / non-newline char, OR a
    // backslash-escape sequence so an escaped `\)` doesn't terminate the
    // URL early (spec examples 504/506/508). Backslash-unescape is
    // applied post-extraction; the pattern only needs to keep escaped
    // close-parens from prematurely ending the match.
    private const URL_CHAR = '(?:\\\\.|[^)\n])';

    // Label character set: forbids unescaped `[` / `]` so the outer
    // bracket pair stays balanced, but allows `\[` / `\]` so an escaped
    // bracket can appear inside the label (spec example 523). The same
    // backslash-escape trick the URL slot already uses.
    private const LABEL_CHAR = '(?:\\\\.|[^\[\]\n])';

    // Image sub-pattern reused for both the label alternative in the main
    // pattern and the image-as-label detector in handle(). No capture
    // groups here — the lexer wraps user patterns in a capture and
    // additional captures would renumber unpredictably.
    private const IMAGE_SUB = '!\[' . self::LABEL_CHAR . '*\]\(' . self::URL_CHAR . '+\)';

    /**
     * Sort order of this mode relative to the other parser modes.
     *
     * @inheritdoc
     * @return int always 300
     */
    public function getSort()
    {
        return 300;
    }

    /**
     * Register the `[text-or-image](url)` special pattern for the given
     * parent mode under the name `gfm_link`.
     *
     * @inheritdoc
     * @param string $mode the mode the pattern is attached to
     */
    public function connectTo($mode)
    {
        // Outer shape: `[text-or-image](url)`. Text class forbids
        // unescaped brackets and newlines but allows `\[` / `\]`; the
        // image alternative explicitly matches one inline image. URL
        // slot is permissive — handle() does URL / title splitting
        // post-entry, mirroring how DW Internallink parses inside `[[...]]`.
        // The `(?!\[)` lookahead keeps `[[` from starting a match here.
        $pattern = '\[(?!\[)(?:' . self::LABEL_CHAR . '+|' . self::IMAGE_SUB . ')\]\(' . self::URL_CHAR . '+\)';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_link');
    }

    /**
     * Turn one matched `[label](target)` into a single link handler call.
     *
     * @inheritdoc
     * @param string  $match   the complete text matched by the pattern from connectTo()
     * @param int     $state   lexer state (not used here)
     * @param int     $pos     position of the match, passed through to the handler call
     * @param Handler $handler receives the resulting call
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Detect image-as-label `[![alt](img)](target)`. Parallels
        // Internallink's `^{{…}}$` check — when the label is exactly an
        // inline image, parse it into a media descriptor; otherwise
        // treat the label as plain text.
        // The `s` modifier lets `\\.` also cover backslash + newline, which
        // only matters if the lexer matched in dot-all mode
        // (NOTE(review): believed to be the case for DokuWiki's lexer — confirm).
        $imageLabelled = '/^\[(' . self::IMAGE_SUB . ')\]\((' . self::URL_CHAR . '+)\)$/s';
        if (preg_match($imageLabelled, $match, $m) === 1) {
            $label     = $this->parseImageDescriptor($m[1]);
            $targetUrl = $this->extractUrl($m[2]);
        } else {
            // A plain text label may still contain an *escaped* `\]`, so the
            // separator is located with the label grammar rather than by
            // searching for the first literal `](`.
            [$rawLabel, $payload] = $this->splitLabelAndPayload($match, '[');
            $label     = Escape::unescapeBackslashes($rawLabel);
            $targetUrl = $this->extractUrl($payload);
        }

        // Classify on the raw URL so windowssharelink detection sees the
        // literal `\\host\path` runs intact — GFM's `\\` → `\` collapse
        // would otherwise destroy the share prefix.
        [$call, $args] = Link::classify($targetUrl, $label);
        if ($call !== 'windowssharelink') {
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }
        $handler->addCall($call, $args, $pos);
        return true;
    }

    /**
     * Split `<open>label](payload)` into its raw label and raw payload.
     *
     * The split follows the same LABEL_CHAR / URL_CHAR grammar the lexer
     * pattern is built from, so an escaped bracket inside the label is kept
     * as part of the label: `[a\](b](url)` yields label `a\](b` and payload
     * `url`, where a plain search for the first `](` would cut the label
     * off at `a\`. Neither part is unescaped or trimmed here.
     *
     * If the grammar regex does not apply (not expected for text that came
     * out of the lexer pattern), the first literal `](` is used instead.
     *
     * @param string $text the full construct, including $open and the final `)`
     * @param string $open literal opening token, `[` for links or `![` for images
     * @return string[] two elements: raw label, raw payload between the parentheses
     */
    private function splitLabelAndPayload(string $text, string $open): array
    {
        $grammar = '/^' . preg_quote($open, '/')
            . '(' . self::LABEL_CHAR . '*)\]\((' . self::URL_CHAR . '+)\)$/s';
        if (preg_match($grammar, $text, $parts) === 1) {
            return [$parts[1], $parts[2]];
        }

        $skip = strlen($open);
        $sep  = strpos($text, '](');
        if ($sep === false) {
            // No separator at all: everything after the opener is label text.
            return [(string) substr($text, $skip), ''];
        }
        return [
            (string) substr($text, $skip, $sep - $skip),
            (string) substr($text, $sep + 2, -1),
        ];
    }

    /**
     * Extract the URL from a parenthesized payload: trim surrounding
     * whitespace, take the first whitespace-delimited token, then
     * apply GFM's URL-slot transformations (entity decoding;
     * backslash-unescape happens later, after Link::classify, because
     * windowssharelink detection needs the raw `\\` runs intact).
     * Any trailing title is discarded (no renderer slot for it).
     *
     * @param string $inside text between the parentheses, without the parentheses
     * @return string the entity-decoded URL token; empty if $inside is blank
     */
    private function extractUrl(string $inside): string
    {
        $inside = trim($inside);
        // Everything from the first space, tab or newline on is the optional title.
        $url = substr($inside, 0, strcspn($inside, " \t\n"));
        return HtmlEntity::decode($url);
    }

    /**
     * Parse an inline image sub-match `![alt](imgUrl)` into the media
     * descriptor shape Media::parseMedia() returns, so the link handler
     * can treat it as a media label identically to `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the complete `![alt](imgUrl)` text
     * @return array descriptor with the keys type, src, title, align, width,
     *               height, cache and linking; title is null for an empty alt
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // The alt text follows the label grammar too, so an escaped `\]`
        // in it must not be mistaken for the end of the alt text.
        [$rawAlt, $payload] = $this->splitLabelAndPayload($imageMatch, '![');
        $alt    = Escape::unescapeBackslashes($rawAlt);
        $imgUrl = Escape::unescapeBackslashes($this->extractUrl($payload));

        $p = MediaHelper::parseParameters($imgUrl);
        $type = (media_isexternal($p['src']) || link_isinterwiki($p['src']))
            ? 'externalmedia'
            : 'internalmedia';

        return [
            'type'    => $type,
            'src'     => $p['src'],
            'title'   => $alt !== '' ? $alt : null,
            'align'   => $p['align'],
            'width'   => $p['width'],
            'height'  => $p['height'],
            'cache'   => $p['cache'],
            'linking' => $p['linking'],
        ];
    }
}
153