<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Helpers\HtmlEntity;

/**
 * Parser mode for external links (URLs).
 *
 * This mode is responsible for recognizing and handling external links in the text. It uses regular expressions
 * to identify URLs based on common schemes and patterns, and it can handle both standard URLs and Markdown-style
 * angle-bracket autolinks.
 */
class Externallink extends AbstractMode
{
    /** @var string[] URL schemes as returned by getSchemes(); filled by preConnect() */
    protected $schemes = [];

    /** @var string[] lexer patterns built by preConnect() and registered by connectTo() */
    protected $patterns = [];

    /** @inheritdoc */
    public function getSort()
    {
        return 330;
    }

    /**
     * Build the list of lexer patterns.
     *
     * The patterns are built only once; later calls return immediately. The order is: one pattern per
     * configured scheme, the `www.` shortcut, the `ftp.` shortcut and - in Markdown-preferred mode only -
     * one angle-bracket autolink pattern per configured scheme.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
        if (count($this->patterns)) return;

        $mdPreferred = $this->registry->isMdPreferred();

        $ltrs = '\w';
        $gunk = '/\#~:.?+=&%@!\-\[\]';
        $punc = '.:?\-;,';
        $tail = '';

        // GFM autolink extension (Markdown-only):
        // - Parentheses are allowed inside URLs; trailing unbalanced `)` are trimmed in handle().
        // - A trailing entity-reference-like sequence (e.g. `&copy;`, `&hl;`) is consumed by the URL regex
        //   and then stripped in handle(); decodeOne() expands valid named/numeric refs to their Unicode
        //   character (`&copy;` -> `©`) while unknown names round-trip as literal text.
        if ($mdPreferred) {
            $gunk .= '()';
            $tail = '(?:' . HtmlEntity::PATTERN . ')?';
        }

        $host = $ltrs . $punc;
        $any = $ltrs . $gunk . $punc;

        // Common ending of all bare-URL patterns: the optional entity tail, then a lookahead requiring
        // that only punctuation separates the match from the first character that cannot be part of a URL.
        $end = $tail . '(?=[' . $punc . ']*[^' . $any . '])';

        $this->schemes = getSchemes();
        foreach ($this->schemes as $scheme) {
            // Scheme names are configuration data, not regex fragments: quote them so characters like
            // `+` or `.` (e.g. `svn+ssh`) are matched literally.
            $this->patterns[] = '\b(?i)' . preg_quote($scheme) . '(?-i)://[' . $any . ']+?' . $end;
        }

        // Scheme-less shortcuts (www.example.com, ftp.example.com). The lookbehind rejects matches that
        // directly follow a slash or backslash. addProtocolPrefix() adds the protocol for these matches.
        foreach (['www?', 'ftp?'] as $prefix) {
            $this->patterns[] = '(?<![/\\\\])\b(?i)' . $prefix . '(?-i)\.[' . $host . ']+?\.' .
                '[' . $host . ']+?[' . $any . ']+?' . $end;
        }

        // Markdown-only: angle-bracket autolinks per CommonMark §6.5. One per-scheme pattern that captures the
        // whole envelope; handle() decides at match time whether to emit a link or literal cdata based on whether
        // the content contains whitespace (which disqualifies the autolink).
        // Angle brackets with white space are basically a simple way to write a URL without triggering autolinking.
        if ($mdPreferred) {
            foreach ($this->schemes as $scheme) {
                $this->patterns[] = '<[ \t]*(?i)' . preg_quote($scheme) . '(?-i)://[^<>\n]*>';
            }
        }
    }

    /**
     * Register every pattern built by preConnect() as a special pattern of the given mode.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
        foreach ($this->patterns as $pattern) {
            $this->Lexer->addSpecialPattern($pattern, $mode, 'externallink');
        }
    }

    /**
     * Dispatch a match to the angle-bracket or the bare-URL handler.
     *
     * A match that starts with `<` and ends with `>` is treated as an angle-bracket autolink envelope;
     * everything else is a bare URL.
     *
     * @inheritdoc
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        $isEnvelope = str_starts_with($match, '<') && str_ends_with($match, '>');
        if ($isEnvelope) {
            $this->handleAngleAutolink($match, $pos, $handler);
        } else {
            $this->handleBareUrl($match, $pos, $handler);
        }
        return true;
    }

    /**
     * Emit a Markdown angle-bracket autolink (CommonMark §6.5).
     *
     * Whitespace inside the brackets disqualifies the autolink; in that case the literal envelope is
     * preserved as cdata so the brackets remain visible.
     *
     * @param string $match The complete match including the surrounding angle brackets
     * @param int $pos Position of the match, passed through to the handler calls
     * @param Handler $handler Receives either an `externallink` or a `cdata` call
     */
    protected function handleAngleAutolink(string $match, int $pos, Handler $handler): void
    {
        if (preg_match('/\s/', $match)) {
            $handler->addCall('cdata', [$match], $pos);
            return;
        }

        // strip the surrounding `<` and `>`; the URL doubles as its own label
        $url = substr($match, 1, -1);
        $handler->addCall('externallink', [$url, $url], $pos);
    }

    /**
     * Emit a bare-URL autolink, optionally preceded by the GFM-extension trim step.
     *
     * In Markdown-preferred mode, peelGfmTail() removes characters the URL regex over-consumed
     * (trailing entity references, unbalanced closing parens) and returns them as a cdata suffix.
     *
     * @param string $match The matched URL text
     * @param int $pos Position of the match, passed through to the handler calls
     * @param Handler $handler Receives the `externallink` call and, if anything was peeled, a `cdata` call
     */
    protected function handleBareUrl(string $match, int $pos, Handler $handler): void
    {
        $url = $match;
        $trailing = '';

        if ($this->registry->isMdPreferred()) {
            $trailing = $this->peelGfmTail($url);
        }

        // must run after peeling: the label of a shortcut URL is the trimmed, unprefixed form
        $title = $this->addProtocolPrefix($url);

        $handler->addCall('externallink', [$url, $title], $pos);
        if ($trailing !== '') {
            $handler->addCall('cdata', [$trailing], $pos);
        }
    }

    /**
     * Peel GFM-extension trailing chars off a URL.
     *
     * The URL regex deliberately over-consumes parentheses and entity references so this method can decide
     * what really belongs to the URL. It peels one of two things at a time, repeating until neither applies:
     *
     * - A trailing entity reference (e.g. `&copy;`): decoded via HtmlEntity::decodeOne so valid named or
     *   numeric refs become their Unicode character and unknown ones round-trip as literal text.
     * - A trailing `)` that has no matching `(` earlier in the URL.
     *
     * Peels prepend to the trailing string so the final order matches the original source.
     *
     * @param string $url Mutated in place to the trimmed URL
     * @return string The peeled-off chars, in original source order, ready to emit as cdata after the link
     */
    protected function peelGfmTail(string &$url): string
    {
        $trailing = '';
        while (true) {
            if (preg_match('/' . HtmlEntity::PATTERN . '$/', $url, $m)) {
                $trailing = HtmlEntity::decodeOne($m[0]) . $trailing;
                $url = substr($url, 0, -strlen($m[0]));
            } elseif (str_ends_with($url, ')') && substr_count($url, ')') > substr_count($url, '(')) {
                // more closing than opening parens overall: the last `)` is not part of the URL
                $trailing = ')' . $trailing;
                $url = substr($url, 0, -1);
            } else {
                break;
            }
        }
        return $trailing;
    }

    /**
     * Add the implicit protocol on www./ftp. URLs and return the visible label.
     *
     * For scheme URLs (http://, ftp://, ...) the label is null, signalling the renderer to display the
     * href verbatim. For www./ftp. shortcuts the label is the original unprefixed form.
     *
     * The shortcut test mirrors the shortcut patterns built in preConnect(): it is case-insensitive and
     * accepts the same optional last letter (`www?.` / `ftp?.`), so every URL matched by those patterns
     * gets a protocol. Scheme URLs are never touched, even if the scheme name starts with `ftp` or `www`.
     *
     * @param string $url Mutated in place to include the protocol prefix when one was added
     * @return string|null The visible label, or null to use the prefixed URL as its own label
     */
    protected function addProtocolPrefix(string &$url): ?string
    {
        if (preg_match('/^ftp?\./i', $url)) {
            $protocol = 'ftp://';
        } elseif (preg_match('/^www?\./i', $url)) {
            $protocol = 'http://';
        } else {
            return null;
        }

        $title = $url;
        $url = $protocol . $url;
        return $title;
    }

    /**
     * Return the lexer patterns built by preConnect().
     *
     * @return array
     */
    public function getPatterns()
    {
        return $this->patterns;
    }
}