<?php

/*
 * This file is part of the league/commonmark package.
 *
 * (c) Colin O'Dell <colinodell@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace League\CommonMark\Extension\Autolink;

use League\CommonMark\Event\DocumentParsedEvent;
use League\CommonMark\Inline\Element\Link;
use League\CommonMark\Inline\Element\Text;

/**
 * Document-parsed listener that turns bare URLs found inside Text nodes into Link nodes.
 *
 * Text nodes whose direct parent is already a Link are left untouched.
 */
final class UrlAutolinkProcessor
{
    // RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php
    //
    // This is a sprintf() template: the single "%s" receives the alternation of allowed
    // protocols and every literal percent sign is therefore doubled.
    //
    // Only the outermost group may capture. The pattern is used with preg_split() and
    // PREG_SPLIT_DELIM_CAPTURE, which returns every participating capture group, so any
    // additional capturing group would inject extra elements into the split result.
    public const REGEX = '~
        (?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(])  # Can only come at the beginning of a line, after whitespace, or certain delimiting characters
        (
            # Must start with a supported scheme + auth, or "www"
            (?:
                (?:%s)://                                 # protocol
                (?:(?:[\.\pL\pN-]+:)?(?:[\.\pL\pN-]+)@)?  # basic auth
            |www\.)
            (?:
                (?:[\pL\pN\pS\-\.])+(?:\.?(?:[\pL\pN]|xn\-\-[\pL\pN-]+)+\.?) # a domain name
                    |                                                 # or
                \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}                    # an IP address
                    |                                                 # or
                \[
                    (?:(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){6})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?:(?:[0-9a-f]{1,4})):){5})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){4})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,1}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){3})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,2}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){2})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,3}(?:(?:[0-9a-f]{1,4})))?::(?:(?:[0-9a-f]{1,4})):)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,4}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,5}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,6}(?:(?:[0-9a-f]{1,4})))?::))))
                \]  # an IPv6 address
            )
            (?::[0-9]+)?                              # a port (optional)
            (?:/ (?:[\pL\pN\-._\~!$&\'()*+,;=:@]|%%[0-9A-Fa-f]{2})* )*      # a path
            (?:\? (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a query (optional)
            (?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a fragment (optional)
        )~ixu';

    /** @var string REGEX with the allowed protocols substituted in */
    private $finalRegex;

    /**
     * @param array<int, string> $allowedProtocols Schemes that may start an autolink; they are
     *                                             joined with "|" and inserted into REGEX as-is
     */
    public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'])
    {
        $this->finalRegex = \sprintf(self::REGEX, \implode('|', $allowedProtocols));
    }

    /**
     * Walks the parsed document and autolinks every Text node that is not the direct child of a Link.
     *
     * @param DocumentParsedEvent $e
     *
     * @return void
     */
    public function __invoke(DocumentParsedEvent $e)
    {
        $walker = $e->getDocument()->walker();

        while ($event = $walker->next()) {
            $node = $event->getNode();
            if ($node instanceof Text && !($node->parent() instanceof Link)) {
                self::processAutolinks($node, $this->finalRegex);
            }
        }
    }

    /**
     * Replaces $node with a sequence of Text and Link siblings when its content contains URLs.
     *
     * The node is left untouched when the regex fails or matches nothing.
     *
     * @param Text   $node  Text node to scan
     * @param string $regex Fully-built URL pattern (exactly one capture group)
     */
    private static function processAutolinks(Text $node, string $regex): void
    {
        $contents = \preg_split($regex, $node->getContent(), -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($contents === false || \count($contents) === 1) {
            return;
        }

        // Characters trimmed off the end of the previous URL; they are emitted as
        // plain text together with the text segment that follows that URL.
        $leftovers = '';
        foreach ($contents as $i => $content) {
            // Even-indexed elements are things before/after the URLs
            if ($i % 2 === 0) {
                $text = $leftovers . $content;
                if ($text !== '') {
                    $node->insertBefore(new Text($text));
                }

                $leftovers = '';
                continue;
            }

            // Odd-indexed elements are the captured URLs
            [$url, $leftovers] = self::splitTrailingCharacters($content);

            self::addLink($node, $url);
        }

        // Everything has been re-inserted as new siblings, so drop the original node
        $node->detach();
    }

    /**
     * Splits a matched URL into the link target and the trailing characters that must stay plain text.
     *
     * @param string $url Raw text matched by the URL regex
     *
     * @return array<int, string> Two elements: the trimmed URL and the stripped suffix (possibly empty)
     */
    private static function splitTrailingCharacters(string $url): array
    {
        $leftovers = '';

        // Does the URL end with punctuation that should be stripped?
        // The first group is lazy so that the WHOLE run of trailing punctuation is
        // removed; a greedy group would only ever give up the final character.
        if (\preg_match('/(.+?)([?!.,:*_~]+)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2];
        }

        // Does the URL end with something that looks like an entity reference?
        if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2] . $leftovers;
        }

        // Does the URL need its closing paren(s) chopped off?
        if (\substr($url, -1) === ')' && ($diff = self::diffParens($url)) > 0) {
            $url = \substr($url, 0, -$diff);
            $leftovers = \str_repeat(')', $diff) . $leftovers;
        }

        return [$url, $leftovers];
    }

    /**
     * Inserts a Link for $url immediately before $node, using the URL itself as the label.
     *
     * @param Text   $node Node the link is inserted before
     * @param string $url  Link label; also the destination, except that "www." URLs get "http://" prepended
     */
    private static function addLink(Text $node, string $url): void
    {
        // Auto-prefix 'http://' onto 'www' URLs
        if (\substr($url, 0, 4) === 'www.') {
            $node->insertBefore(new Link('http://' . $url, $url));

            return;
        }

        $node->insertBefore(new Link($url, $url));
    }

    /**
     * Counts how many more closing than opening parentheses $content contains.
     *
     * @param string $content
     *
     * @return int Number of ")" minus number of "("; zero or negative when there is no surplus
     */
    private static function diffParens(string $content): int
    {
        // Scan the entire autolink for the total number of parentheses.
        // If there is a greater number of closing parentheses than opening ones,
        // the surplus is not considered part of the autolink, in order to
        // facilitate including an autolink inside a parenthesis.
        \preg_match_all('/[()]/', $content, $matches);

        $charCount = ['(' => 0, ')' => 0];
        foreach ($matches[0] as $char) {
            $charCount[$char]++;
        }

        return $charCount[')'] - $charCount['('];
    }
}