xref: /dokuwiki/inc/Parsing/ParserMode/Externallink.php (revision b73ece99c18919754d993a1d1f5cb27140555705)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Helpers\HtmlEntity;
7
/**
 * Parser mode for external links (URLs).
 *
 * Three kinds of source text are recognized and turned into `externallink` instructions:
 *
 *  - bare URLs that start with one of the configured schemes (`scheme://...`),
 *  - scheme-less `www.` / `ftp.` shortcuts, which get their protocol added in handle(),
 *  - Markdown angle-bracket autolinks (`<scheme://...>`), only when Markdown is the preferred syntax.
 */
class Externallink extends AbstractMode
{
    /** @var string[] URL schemes as returned by getSchemes(); filled by preConnect() */
    protected $schemes = [];

    /** @var string[] Lexer patterns built by preConnect() */
    protected $patterns = [];

    /** @inheritdoc */
    public function getSort()
    {
        return 330;
    }

    /**
     * Build the lexer patterns for all supported URL forms.
     *
     * The patterns are built once; later calls return early as soon as the list is non-empty.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        if (count($this->patterns)) return;

        $ltrs = '\w';
        $gunk = '/\#~:.?+=&%@!\-\[\]';
        $punc = '.:?\-;,';
        $tail = '';

        $mdPreferred = $this->registry->isMdPreferred();

        // GFM autolink extension (Markdown-only):
        //   - Parentheses are allowed inside URLs; trailing unbalanced `)` are trimmed in handle().
        //   - A trailing entity-reference-like sequence (e.g. `&copy;`, `&hl;`) is consumed by the URL regex
        //     and then stripped in handle(); decodeOne() expands valid named/numeric refs to their Unicode
        //     character (`&copy;` -> `©`) while unknown names round-trip as literal text.
        if ($mdPreferred) {
            $gunk .= '()';
            $tail = '(?:' . HtmlEntity::PATTERN . ')?';
        }

        $host = $ltrs . $punc;
        $any  = $ltrs . $gunk . $punc;

        // A URL ends where optional punctuation is followed by a character that cannot be part of a URL.
        // The lookahead keeps that trailing punctuation out of the match.
        $end = '(?=[' . $punc . ']*[^' . $any . '])';

        $this->schemes = getSchemes();
        foreach ($this->schemes as $scheme) {
            $this->patterns[] = '\b(?i)' . $scheme . '(?-i)://[' . $any . ']+?' . $tail . $end;
        }

        // Scheme-less shortcuts: a www./ftp. host not directly preceded by a slash or backslash.
        foreach (['www?', 'ftp?'] as $shortcut) {
            $this->patterns[] = '(?<![/\\\\])\b(?i)' . $shortcut . '(?-i)\.[' . $host . ']+?\.' .
                                '[' . $host . ']+?[' . $any . ']+?' . $tail . $end;
        }

        // Markdown-only: angle-bracket autolinks per CommonMark §6.5. One per-scheme pattern that captures the whole
        // envelope; handle() decides at match time whether to emit a link or literal cdata based on whether the
        // content contains whitespace (which disqualifies the autolink).
        // Angle brackets with whitespace are therefore a simple way to write a URL without triggering autolinking.
        if ($mdPreferred) {
            foreach ($this->schemes as $scheme) {
                $this->patterns[] = '<[ \t]*(?i)' . $scheme . '(?-i)://[^<>\n]*>';
            }
        }
    }

    /**
     * Register every pattern built by preConnect() as a special pattern of the given mode.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        foreach ($this->patterns as $pattern) {
            $this->Lexer->addSpecialPattern($pattern, $mode, 'externallink');
        }
    }

    /**
     * Dispatch a lexer match to the angle-bracket or the bare-URL handler.
     *
     * A match that is wrapped in `<` and `>` is treated as an angle-bracket autolink, anything else as
     * a bare URL.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        if (str_starts_with($match, '<') && str_ends_with($match, '>')) {
            $this->handleAngleAutolink($match, $pos, $handler);
        } else {
            $this->handleBareUrl($match, $pos, $handler);
        }
        return true;
    }

    /**
     * Emit a Markdown angle-bracket autolink (CommonMark §6.5).
     *
     * Whitespace inside the brackets disqualifies the autolink; in that case the literal envelope is
     * preserved as cdata so the brackets remain visible. Otherwise the text between the brackets is
     * used as both the link target and its label.
     *
     * @param string $match The full match including the angle brackets
     * @param int $pos Position of the match, passed through to the handler
     * @param Handler $handler Receives the resulting call
     */
    protected function handleAngleAutolink(string $match, int $pos, Handler $handler): void
    {
        if (preg_match('/\s/', $match)) {
            $handler->addCall('cdata', [$match], $pos);
            return;
        }
        $url = substr($match, 1, -1);
        $handler->addCall('externallink', [$url, $url], $pos);
    }

    /**
     * Emit a bare-URL autolink, optionally preceded by the GFM-extension trim step.
     *
     * In Markdown-preferred mode, peelGfmTail() removes characters the URL regex over-consumed
     * (trailing entity references, unbalanced closing parens); they are emitted as a cdata call
     * right after the link.
     *
     * @param string $match The matched URL or www./ftp. shortcut
     * @param int $pos Position of the match, passed through to the handler
     * @param Handler $handler Receives the resulting call(s)
     */
    protected function handleBareUrl(string $match, int $pos, Handler $handler): void
    {
        $url = $match;
        $trailing = '';

        if ($this->registry->isMdPreferred()) {
            $trailing = $this->peelGfmTail($url);
        }

        $title = $this->addProtocolPrefix($url);

        $handler->addCall('externallink', [$url, $title], $pos);
        if ($trailing !== '') {
            $handler->addCall('cdata', [$trailing], $pos);
        }
    }

    /**
     * Peel GFM-extension trailing chars off a URL.
     *
     * The URL regex deliberately over-consumes parentheses and entity references so this method can decide
     * what really belongs to the URL. It peels one of two things at a time, repeating until neither applies:
     *
     *  - A trailing entity reference (e.g. &copy;): decoded via HtmlEntity::decodeOne so valid named or
     *    numeric refs become their Unicode character and unknown ones round-trip as literal text.
     *  - A trailing ) when the URL holds more ) than ( characters.
     *
     * Peels prepend to the trailing string so the final order matches the original source.
     *
     * @param string $url Mutated in place to the trimmed URL
     * @return string The peeled-off chars, in original source order, ready to emit as cdata after the link
     */
    protected function peelGfmTail(string &$url): string
    {
        $trailing = '';
        while (true) {
            // The non-empty check is defensive: an empty match would make substr(..., -0) wipe the URL
            // and the loop would never terminate.
            if (preg_match('/' . HtmlEntity::PATTERN . '$/', $url, $m) && $m[0] !== '') {
                $trailing = HtmlEntity::decodeOne($m[0]) . $trailing;
                $url = substr($url, 0, -strlen($m[0]));
            } elseif (str_ends_with($url, ')') && substr_count($url, ')') > substr_count($url, '(')) {
                $trailing = ')' . $trailing;
                $url = substr($url, 0, -1);
            } else {
                break;
            }
        }
        return $trailing;
    }

    /**
     * Add the implicit protocol on www./ftp. URLs and return the visible label.
     *
     * For scheme URLs (http://, ftp://, ftps://, ...) the label is null, signalling the renderer to display
     * the href verbatim. For www./ftp. shortcuts the label is the original unprefixed form.
     *
     * The shortcut is detected by its host label followed by a dot, compared case-insensitively because the
     * lexer patterns match these labels case-insensitively as well. Requiring the dot keeps URLs whose scheme
     * merely begins with "ftp" or "www" (e.g. ftps://) from being prefixed a second time.
     *
     * @param string $url Mutated in place to include the protocol prefix when one was added
     * @return string|null The visible label, or null to use the URL as its own label
     */
    protected function addProtocolPrefix(string &$url): ?string
    {
        $protocols = ['ftp' => 'ftp://', 'www' => 'http://'];

        foreach ($protocols as $label => $protocol) {
            $shortcut = $label . '.';
            if (strncasecmp($url, $shortcut, strlen($shortcut)) === 0) {
                $title = $url;
                $url = $protocol . $url;
                return $title;
            }
        }

        return null;
    }

    /**
     * Return the lexer patterns built by preConnect().
     *
     * @return string[] Empty until preConnect() has run
     */
    public function getPatterns()
    {
        return $this->patterns;
    }
}
195