<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Helpers\HtmlEntity;
use dokuwiki\Parsing\ModeRegistry;

/**
 * Parser mode for external links (URLs).
 *
 * This mode recognizes external links in the text. It registers one lexer pattern per configured URL
 * scheme, two patterns for the `www.` / `ftp.` shortcuts and, when Markdown syntax is preferred, one
 * pattern per scheme for Markdown-style angle-bracket autolinks (`<scheme://...>`).
 */
class Externallink extends AbstractMode
{
    /** @var array URL schemes as returned by getSchemes(); filled by preConnect() */
    protected $schemes = [];

    /** @var array lexer patterns built by preConnect() and registered by connectTo() */
    protected $patterns = [];

    /** @inheritdoc */
    public function getSort()
    {
        return 330;
    }

    /**
     * Build the lexer patterns. Does nothing when the patterns have already been built.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        if (count($this->patterns)) return;

        $mdPreferred = ModeRegistry::getInstance()->isMdPreferred();

        // character class fragments (contents of a [...] class, already escaped)
        $ltrs = '\w';
        $gunk = '/\#~:.?+=&%@!\-\[\]';
        $punc = '.:?\-;,';
        $tail = '';

        // GFM autolink extension (Markdown-only):
        // - Parentheses are allowed inside URLs; trailing unbalanced `)` are trimmed in handle().
        // - A trailing entity-reference-like sequence (e.g. `&copy;`, `&hl;`) is consumed by the URL regex
        //   and then stripped in handle(); decodeOne() expands valid named/numeric refs to their Unicode
        //   character (`&copy;` -> `©`) while unknown names round-trip as literal text.
        if ($mdPreferred) {
            $gunk .= '()';
            $tail = '(?:' . HtmlEntity::PATTERN . ')?';
        }

        $host = $ltrs . $punc;
        $any = $ltrs . $gunk . $punc;

        // Shared ending of every bare-URL pattern: the optional entity tail, then a lookahead requiring
        // optional punctuation followed by one character that cannot be part of a URL.
        $end = $tail . '(?=[' . $punc . ']*[^' . $any . '])';

        $this->schemes = getSchemes();
        foreach ($this->schemes as $scheme) {
            $this->patterns[] = '\b(?i)' . $scheme . '(?-i)://[' . $any . ']+?' . $end;
        }

        // Scheme-less shortcuts, not preceded by a slash or backslash.
        // NOTE: the trailing `?` makes the last letter optional, so `ww.` and `ft.` match as well.
        foreach (['www?', 'ftp?'] as $shortcut) {
            $this->patterns[] = '(?<![/\\\\])\b(?i)' . $shortcut . '(?-i)\.[' . $host . ']+?\.' .
                '[' . $host . ']+?[' . $any . ']+?' . $end;
        }

        // Markdown-only: angle-bracket autolinks per CommonMark §6.5. One per-scheme pattern that captures the
        // whole envelope; handle() decides at match time whether to emit a link or literal cdata based on
        // whether the content contains whitespace (which disqualifies the autolink).
        // Angle brackets with whitespace are thus a simple way to write a URL without triggering autolinking.
        if ($mdPreferred) {
            foreach ($this->schemes as $scheme) {
                $this->patterns[] = '<[ \t]*(?i)' . $scheme . '(?-i)://[^<>\n]*>';
            }
        }
    }

    /**
     * Register every pattern built by preConnect() as a special pattern of the given mode.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        foreach ($this->patterns as $pattern) {
            $this->Lexer->addSpecialPattern($pattern, $mode, 'externallink');
        }
    }

    /**
     * Dispatch a match to the angle-bracket or the bare-URL handler.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // only the angle-bracket patterns produce matches wrapped in < and >
        if (str_starts_with($match, '<') && str_ends_with($match, '>')) {
            $this->handleAngleAutolink($match, $pos, $handler);
        } else {
            $this->handleBareUrl($match, $pos, $handler);
        }
        return true;
    }

    /**
     * Emit a Markdown angle-bracket autolink (CommonMark §6.5).
     *
     * Whitespace inside the brackets disqualifies the autolink; in that case the literal envelope is
     * preserved as cdata so the brackets remain visible.
     *
     * @param string $match The full match including the surrounding angle brackets
     * @param int $pos Position passed on to the handler calls
     * @param Handler $handler Receives the externallink or cdata call
     */
    protected function handleAngleAutolink(string $match, int $pos, Handler $handler): void
    {
        if (preg_match('/\s/', $match)) {
            $handler->addCall('cdata', [$match], $pos);
            return;
        }

        // strip the enclosing < and >; the URL doubles as its own label
        $url = substr($match, 1, -1);
        $handler->addCall('externallink', [$url, $url], $pos);
    }

    /**
     * Emit a bare-URL autolink, optionally preceded by the GFM-extension trim step.
     *
     * In Markdown-preferred mode, peelGfmTail() removes characters the URL regex over-consumed
     * (trailing entity references, unbalanced closing parens) and returns them as a cdata suffix.
     *
     * @param string $match The matched URL text
     * @param int $pos Position passed on to the handler calls
     * @param Handler $handler Receives the externallink call and, if needed, the trailing cdata call
     */
    protected function handleBareUrl(string $match, int $pos, Handler $handler): void
    {
        $url = $match;
        $trailing = '';

        if (ModeRegistry::getInstance()->isMdPreferred()) {
            $trailing = $this->peelGfmTail($url);
        }

        $title = $this->addProtocolPrefix($url);

        $handler->addCall('externallink', [$url, $title], $pos);
        if ($trailing !== '') {
            $handler->addCall('cdata', [$trailing], $pos);
        }
    }

    /**
     * Peel GFM-extension trailing chars off a URL.
     *
     * The URL regex deliberately over-consumes parentheses and entity references so this method can decide
     * what really belongs to the URL. It peels one of two things at a time, repeating until neither applies:
     *
     * - A trailing entity reference (e.g. &copy;): decoded via HtmlEntity::decodeOne so valid named or
     *   numeric refs become their Unicode character and unknown ones round-trip as literal text.
     * - A trailing ) that has no matching ( earlier in the URL.
     *
     * Peels prepend to the trailing string so the final order matches the original source.
     *
     * @param string $url Mutated in place to the trimmed URL
     * @return string The peeled-off chars, in original source order, ready to emit as cdata after the link
     */
    protected function peelGfmTail(string &$url): string
    {
        // The pattern is wrapped in a group so the end anchor applies to the pattern as a whole, even if
        // HtmlEntity::PATTERN should contain a top-level alternation (the constant is defined elsewhere).
        $entityAtEnd = '/(?:' . HtmlEntity::PATTERN . ')$/';

        $trailing = '';
        while (true) {
            if (preg_match($entityAtEnd, $url, $m)) {
                $trailing = HtmlEntity::decodeOne($m[0]) . $trailing;
                $url = substr($url, 0, -strlen($m[0]));
            } elseif (str_ends_with($url, ')') && substr_count($url, ')') > substr_count($url, '(')) {
                // more closing than opening parens: the last one cannot belong to the URL
                $trailing = ')' . $trailing;
                $url = substr($url, 0, -1);
            } else {
                break;
            }
        }
        return $trailing;
    }

    /**
     * Add the implicit protocol on www./ftp. URLs and return the visible label.
     *
     * For scheme URLs (http://, ftp://, ...) the label is null, signalling the renderer to display the
     * href verbatim. For www./ftp. shortcuts the label is the original unprefixed form.
     *
     * The shortcut is detected by its literal `www.` / `ftp.` prefix including the dot, compared
     * case-insensitively like the lexer patterns. Requiring the dot keeps URLs of schemes that merely
     * start with "ftp" (e.g. ftps://) from being prefixed with ftp:// a second time.
     *
     * @param string $url Mutated in place to include the protocol prefix when one was added
     * @return string|null The visible label, or null to use the prefixed URL as its own label
     */
    protected function addProtocolPrefix(string &$url): ?string
    {
        if (preg_match('/^(ftp|www)\./i', $url, $m) !== 1) {
            return null;
        }

        $title = $url;
        $protocol = strtolower($m[1]) === 'ftp' ? 'ftp://' : 'http://';
        $url = $protocol . $url;
        return $title;
    }

    /**
     * Return the lexer patterns built by preConnect().
     *
     * @return array
     */
    public function getPatterns()
    {
        return $this->patterns;
    }
}