xref: /dokuwiki/inc/Parsing/ParserMode/GfmLink.php (revision 74031e463764923581b9204cebc0fc3f34ce881f)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\Escape;
7use dokuwiki\Parsing\Helpers\Link;
8use dokuwiki\Parsing\Helpers\Media as MediaHelper;
9
10/**
11 * GFM inline link [text](url) with optional title [text](url "title").
12 *
13 * The link text may be either plain text (the common case) or an inline
14 * image `![alt](imgUrl)` — the Markdown equivalent of DW's
15 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
16 * handler call with a media descriptor array in the label slot, reusing
17 * the same flow that `Internallink` already drives. No new handler
18 * instructions; renderers (xhtml, odt, metadata, …) already know how to
19 * render a link whose label is a media descriptor.
20 *
21 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
22 * plus handle-time parsing, rather than encoding every GFM rule at
23 * pattern level.
24 *
25 * Deliberately not supported (see skip.php for the affected spec examples):
26 *
27 *   - Reference links [text][id] / [text][] / [foo] — the single-pass
28 *     lexer cannot resolve forward references to [foo]: url definitions.
29 *   - Pointy-bracket destinations [link](<foo bar>) — the simplified
30 *     pattern will happily match, but handle() produces an internallink
31 *     with a broken src; spec tests for this stay in skip.php.
32 *   - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
33 *     at first `)`, producing odd output; also in skip.php.
34 *   - Title HTML attribute — DokuWiki link handler instructions have no
35 *     title-attribute slot, and plumbing one through every renderer just
36 *     for this is out of scope. The title parses cleanly but is discarded.
37 *   - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
38 *     — matches DW's policy: Internallink only converts the label to a
39 *     media descriptor when it matches `^{{…}}$` exactly.
40 */
class GfmLink extends AbstractMode
{
    /**
     * Regex fragment matching one inline image `![alt](imgUrl)`.
     *
     * Used twice: as the image alternative of the label in the lexer
     * pattern (connectTo) and inside the image-as-label check (handle).
     * It must stay free of capture groups: the lexer wraps user patterns
     * in a capture of its own, and extra groups would renumber
     * unpredictably.
     */
    private const IMAGE_SUB = '!\[[^\[\]\n]*\]\([^)\n]+\)';

    /** @inheritdoc */
    public function getSort()
    {
        return 300;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // Overall shape is `[label](payload)`:
        //   - the label is either bracket-free, newline-free text or exactly
        //     one inline image (IMAGE_SUB);
        //   - the payload is anything up to the first `)` on the same line.
        // The payload is kept deliberately loose; splitting it into URL and
        // optional title is handle()'s job, the same way DW's Internallink
        // parses the inside of `[[...]]` after the match.
        $linkPattern = sprintf('\[(?!\[)(?:[^\[\]\n]+|%s)\]\([^)\n]+\)', self::IMAGE_SUB);

        $this->Lexer->addSpecialPattern($linkPattern, $mode, 'gfm_link');
    }

    /** @inheritdoc */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Image-as-label form `[![alt](img)](target)`: like Internallink's
        // `^{{…}}$` test, the label only becomes a media descriptor when it
        // is exactly one inline image. Everything else is a text label.
        $imageLabelRegex = '/^\[(' . self::IMAGE_SUB . ')\]\(([^)\n]+)\)$/';
        $groups = [];

        if (preg_match($imageLabelRegex, $match, $groups) === 1) {
            $label   = $this->parseImageDescriptor($groups[1]);
            $payload = $groups[2];
        } else {
            // A text label never contains `]`, so the first `](` in the
            // match is where the label ends and the target begins.
            $boundary = strpos($match, '](');
            $label    = Escape::unescapeBackslashes(substr($match, 1, $boundary - 1));
            $payload  = substr($match, $boundary + 2, -1);
        }

        $targetUrl = $this->extractUrl($payload);

        // Classification runs on the still-escaped URL: collapsing `\\` to
        // `\` first would wipe out the `\\host\path` prefix that identifies
        // a Windows share. Only non-share targets are unescaped afterwards.
        [$call, $args] = Link::classify($targetUrl, $label);
        $isWindowsShare = ($call === 'windowssharelink');
        if (!$isWindowsShare) {
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }

        $handler->addCall($call, $args, $pos);

        return true;
    }

    /**
     * Pull the URL out of the text found between the parentheses.
     *
     * Surrounding whitespace is trimmed and the result is cut at the first
     * space, tab or newline, so a trailing title is dropped (there is no
     * renderer slot for it).
     *
     * @param string $inside raw text between `(` and `)`
     * @return string the first whitespace-delimited token, possibly empty
     */
    private function extractUrl(string $inside): string
    {
        $trimmed   = trim($inside);
        $urlLength = strcspn($trimmed, " \t\n");

        return substr($trimmed, 0, $urlLength);
    }

    /**
     * Turn an inline image sub-match `![alt](imgUrl)` into a media
     * descriptor array.
     *
     * The array has the shape Media::parseMedia() returns, which lets the
     * link handler treat it as a media label exactly like
     * `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the `![alt](imgUrl)` text
     * @return array media descriptor with the keys type, src, title, align,
     *               width, height, cache and linking
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // The alt text cannot contain `]`, so the first `](` splits alt
        // from URL. Offset 2 skips the leading `![`.
        $boundary = strpos($imageMatch, '](');
        $altText  = Escape::unescapeBackslashes(substr($imageMatch, 2, $boundary - 2));
        $rawUrl   = $this->extractUrl(substr($imageMatch, $boundary + 2, -1));

        $params = MediaHelper::parseParameters(Escape::unescapeBackslashes($rawUrl));
        $source = $params['src'];

        if (media_isexternal($source) || link_isinterwiki($source)) {
            $mediaType = 'externalmedia';
        } else {
            $mediaType = 'internalmedia';
        }

        $descriptor = [
            'type'  => $mediaType,
            'src'   => $source,
            // An empty alt text is reported as "no title".
            'title' => $altText === '' ? null : $altText,
        ];

        // The remaining attributes are taken over unchanged, in this order.
        foreach (['align', 'width', 'height', 'cache', 'linking'] as $key) {
            $descriptor[$key] = $params[$key];
        }

        return $descriptor;
    }
}
134}
135