xref: /dokuwiki/inc/Parsing/ParserMode/Externallink.php (revision e7dae73bcd947f44c901faaac9dd45de67633a3b)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\HtmlEntity;
7use dokuwiki\Parsing\ModeRegistry;
8
9/**
10 * Parser mode for external links (URLs).
11 *
12 * This mode is responsible for recognizing and handling external links in the text. It uses regular expressions
13 * to identify URLs based on common schemes and patterns, and it can handle both standard URLs and Markdown-style
14 * angle-bracket autolinks.
15 */
class Externallink extends AbstractMode
{
    /**
     * Scheme-less shortcut forms recognised by the lexer, mapped to the protocol they imply.
     *
     * Keys are regex fragments (matched case-insensitively and followed by a literal dot). The same table
     * is used to build the lexer patterns in preConnect() and to detect shortcuts in addProtocolPrefix(),
     * so both always agree on what counts as a shortcut.
     */
    protected const SHORTCUTS = [
        'www?' => 'http://',
        'ftp?' => 'ftp://',
    ];

    /** @var string[] URL schemes as returned by getSchemes(), filled by preConnect() */
    protected $schemes = [];

    /** @var string[] lexer patterns built by preConnect() */
    protected $patterns = [];

    /** @inheritdoc */
    public function getSort()
    {
        return 330;
    }

    /**
     * Build the lexer patterns for scheme URLs, www./ftp. shortcuts and (Markdown only) angle-bracket
     * autolinks. The patterns are built once; later calls return immediately.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        if (count($this->patterns)) return;

        $mdPreferred = ModeRegistry::getInstance()->isMdPreferred();

        $ltrs = '\w';
        $gunk = '/\#~:.?+=&%@!\-\[\]';
        $punc = '.:?\-;,';
        $tail = '';

        // GFM autolink extension (Markdown-only):
        //   - Parentheses are allowed inside URLs; trailing unbalanced `)` are trimmed in handle().
        //   - A trailing entity-reference-like sequence (e.g. `&copy;`, `&hl;`) is consumed by the URL regex
        //     and then stripped in handle(); decodeOne() expands valid named/numeric refs to their Unicode
        //     character (`&copy;` -> `©`) while unknown names round-trip as literal text.
        if ($mdPreferred) {
            $gunk .= '()';
            $tail = '(?:' . HtmlEntity::PATTERN . ')?';
        }

        $host = $ltrs . $punc;
        $any  = $ltrs . $gunk . $punc;

        // A URL ends where optional punctuation is followed by a character that cannot be part of a URL.
        $end = '(?=[' . $punc . ']*[^' . $any . '])';

        $this->schemes = getSchemes();
        foreach ($this->schemes as $scheme) {
            // Schemes may legally contain `+`, `-` and `.` (e.g. svn+ssh); quote them so they are matched
            // literally instead of being read as regex operators.
            $this->patterns[] = '\b(?i)' . preg_quote($scheme) . '(?-i)://[' . $any . ']+?' . $tail . $end;
        }

        // Scheme-less shortcuts (www.example.com, ftp.example.com). The look-behind keeps them from
        // matching directly after a slash or backslash.
        foreach (array_keys(self::SHORTCUTS) as $shortcut) {
            $this->patterns[] = '(?<![/\\\\])\b(?i)' . $shortcut . '(?-i)\.[' . $host . ']+?\.' .
                                '[' . $host . ']+?[' . $any . ']+?' . $tail . $end;
        }

        // Markdown-only: angle-bracket autolinks per CommonMark §6.5. One per-scheme pattern that captures the whole
        // envelope; handle() decides at match time whether to emit a link or literal cdata based on whether the content
        // contains whitespace (which disqualifies the autolink).
        // Angle brackets with white space are basically a simple way to write a URL without triggering autolinking
        if ($mdPreferred) {
            foreach ($this->schemes as $scheme) {
                $this->patterns[] = '<[ \t]*(?i)' . preg_quote($scheme) . '(?-i)://[^<>\n]*>';
            }
        }
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        foreach ($this->patterns as $pattern) {
            $this->Lexer->addSpecialPattern($pattern, $mode, 'externallink');
        }
    }

    /**
     * Dispatch a match to the angle-bracket or the bare-URL handler.
     *
     * A match wrapped in `<` ... `>` can only come from the angle-bracket patterns; everything else is a
     * bare URL or a www./ftp. shortcut.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        if (str_starts_with($match, '<') && str_ends_with($match, '>')) {
            $this->handleAngleAutolink($match, $pos, $handler);
        } else {
            $this->handleBareUrl($match, $pos, $handler);
        }
        return true;
    }

    /**
     * Emit a Markdown angle-bracket autolink (CommonMark §6.5).
     *
     * Whitespace anywhere inside the match disqualifies the autolink; in that case the literal envelope is
     * preserved as cdata so the brackets remain visible. Otherwise the brackets are stripped and the URL
     * is used as both link target and label.
     *
     * @param string $match The full match including the surrounding angle brackets
     * @param int $pos Position of the match, passed through to the handler calls
     * @param Handler $handler Receives the resulting call
     */
    protected function handleAngleAutolink(string $match, int $pos, Handler $handler): void
    {
        if (preg_match('/\s/', $match)) {
            $handler->addCall('cdata', [$match], $pos);
            return;
        }
        $url = substr($match, 1, -1);
        $handler->addCall('externallink', [$url, $url], $pos);
    }

    /**
     * Emit a bare-URL autolink, optionally preceded by the GFM-extension trim step.
     *
     * In Markdown-preferred mode, peelGfmTail() removes characters the URL regex over-consumed
     * (trailing entity references, unbalanced closing parens) and returns them as a cdata suffix.
     *
     * @param string $match The matched URL or www./ftp. shortcut
     * @param int $pos Position of the match, passed through to the handler calls
     * @param Handler $handler Receives the link call and, if anything was peeled off, a cdata call
     */
    protected function handleBareUrl(string $match, int $pos, Handler $handler): void
    {
        $url = $match;
        $trailing = '';

        if (ModeRegistry::getInstance()->isMdPreferred()) {
            $trailing = $this->peelGfmTail($url);
        }

        $title = $this->addProtocolPrefix($url);

        $handler->addCall('externallink', [$url, $title], $pos);
        if ($trailing !== '') {
            $handler->addCall('cdata', [$trailing], $pos);
        }
    }

    /**
     * Peel GFM-extension trailing chars off a URL.
     *
     * The URL regex deliberately over-consumes parentheses and entity references so this method can decide
     * what really belongs to the URL. It peels one of two things at a time, repeating until neither applies:
     *
     *  - A trailing entity reference (e.g. &copy;): decoded via HtmlEntity::decodeOne so valid named or
     *    numeric refs become their Unicode character and unknown ones round-trip as literal text.
     *  - A trailing ) that has no matching ( earlier in the URL.
     *
     * Peels prepend to the trailing string so the final order matches the original source.
     *
     * @param string $url Mutated in place to the trimmed URL
     * @return string The peeled-off chars, in original source order, ready to emit as cdata after the link
     */
    protected function peelGfmTail(string &$url): string
    {
        $entityAtEnd = '/' . HtmlEntity::PATTERN . '$/';
        $trailing = '';

        while (true) {
            // The empty-match guard keeps the loop finite should the entity pattern ever match nothing.
            if (preg_match($entityAtEnd, $url, $m) && $m[0] !== '') {
                $trailing = HtmlEntity::decodeOne($m[0]) . $trailing;
                $url = substr($url, 0, -strlen($m[0]));
            } elseif (str_ends_with($url, ')') && substr_count($url, ')') > substr_count($url, '(')) {
                $trailing = ')' . $trailing;
                $url = substr($url, 0, -1);
            } else {
                break;
            }
        }
        return $trailing;
    }

    /**
     * Add the implicit protocol on www./ftp. URLs and return the visible label.
     *
     * For scheme URLs (http://, ftp://, ...) the label is null, signalling the renderer to display the
     * href verbatim. For www./ftp. shortcuts the label is the original unprefixed form.
     *
     * Shortcuts are detected with the same case-insensitive fragments the lexer patterns use, anchored
     * to the start and followed by a dot. That way `WWW.example.com` receives its protocol, while a
     * scheme that merely begins with the same letters (e.g. `ftps://`) is left alone.
     *
     * @param string $url Mutated in place to include the protocol prefix when one was added
     * @return string|null The visible label, or null to use the prefixed URL as its own label
     */
    protected function addProtocolPrefix(string &$url): ?string
    {
        foreach (self::SHORTCUTS as $shortcut => $protocol) {
            if (preg_match('/^' . $shortcut . '\./i', $url)) {
                $title = $url;
                $url = $protocol . $url;
                return $title;
            }
        }
        return null;
    }

    /**
     * Return the lexer patterns built by preConnect().
     *
     * @return string[] Empty until preConnect() has run
     */
    public function getPatterns()
    {
        return $this->patterns;
    }
}
196