1<?php
2
3/*
4 * This file is part of the league/commonmark package.
5 *
6 * (c) Colin O'Dell <colinodell@gmail.com>
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11
12namespace League\CommonMark\Extension\Autolink;
13
14use League\CommonMark\Event\DocumentParsedEvent;
15use League\CommonMark\Inline\Element\Link;
16use League\CommonMark\Inline\Element\Text;
17
/**
 * Turns bare URLs found in text nodes into Link nodes (GFM-style extended autolinks).
 *
 * The processor is invoked with a DocumentParsedEvent, walks the document and
 * replaces every Text node that contains at least one URL with a sequence of
 * Text / Link nodes. Text nodes whose direct parent is already a Link are skipped.
 */
final class UrlAutolinkProcessor
{
    // RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php
    const REGEX = '~
        (?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(])  # Can only come at the beginning of a line, after whitespace, or certain delimiting characters
        (
            # Must start with a supported scheme + auth, or "www"
            (?:
                (?:%s)://                                 # protocol
                (?:([\.\pL\pN-]+:)?([\.\pL\pN-]+)@)?      # basic auth
            |www\.)
            (?:
                (?:[\pL\pN\pS\-\.])+(?:\.?(?:[\pL\pN]|xn\-\-[\pL\pN-]+)+\.?) # a domain name
                    |                                                 # or
                \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}                    # an IP address
                    |                                                 # or
                \[
                    (?:(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){6})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?:(?:[0-9a-f]{1,4})):){5})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){4})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,1}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){3})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,2}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){2})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,3}(?:(?:[0-9a-f]{1,4})))?::(?:(?:[0-9a-f]{1,4})):)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,4}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,5}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,6}(?:(?:[0-9a-f]{1,4})))?::))))
                \]  # an IPv6 address
            )
            (?::[0-9]+)?                              # a port (optional)
            (?:/ (?:[\pL\pN\-._\~!$&\'()*+,;=:@]|%%[0-9A-Fa-f]{2})* )*      # a path
            (?:\? (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a query (optional)
            (?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a fragment (optional)
        )~ixu';

    /**
     * REGEX with the allowed protocol alternation substituted in.
     *
     * @var string
     */
    private $finalRegex;

    /**
     * @param array<int, string> $allowedProtocols Schemes that may start an autolink; they are
     *                                             joined with "|" and inserted verbatim into REGEX
     */
    public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'])
    {
        $this->finalRegex = \sprintf(self::REGEX, \implode('|', $allowedProtocols));
    }

    /**
     * Walks the parsed document and autolinks every Text node that is not
     * already the direct child of a Link.
     *
     * @param DocumentParsedEvent $e
     *
     * @return void
     */
    public function __invoke(DocumentParsedEvent $e)
    {
        $walker = $e->getDocument()->walker();

        while ($event = $walker->next()) {
            $node = $event->getNode();
            if ($node instanceof Text && !($node->parent() instanceof Link)) {
                self::processAutolinks($node, $this->finalRegex);
            }
        }
    }

    /**
     * Replaces $node with Text/Link siblings if its content contains URLs.
     *
     * When the regex finds nothing (or fails) the node is left untouched.
     *
     * @param Text   $node  Text node to scan
     * @param string $regex Fully-built URL pattern
     */
    private static function processAutolinks(Text $node, string $regex): void
    {
        $subject = $node->getContent();

        // Whole-match offsets are used instead of preg_split() + DELIM_CAPTURE:
        // the pattern contains extra capture groups (basic auth), which would
        // otherwise be emitted as additional "parts" and break the text/URL
        // alternation for URLs such as http://user:pass@host.
        $count = \preg_match_all($regex, $subject, $matches, \PREG_OFFSET_CAPTURE);
        if ($count === false || $count === 0) {
            return;
        }

        $cursor = 0;     // Byte offset of the first character not yet emitted
        $leftovers = ''; // Characters trimmed off the previous URL, owed to the next text run

        foreach ($matches[0] as [$match, $offset]) {
            $before = $leftovers . \substr($subject, $cursor, $offset - $cursor);
            if ($before !== '') {
                $node->insertBefore(new Text($before));
            }

            [$url, $leftovers] = self::splitTrailing($match);
            self::addLink($node, $url);

            $cursor = $offset + \strlen($match);
        }

        $after = $leftovers . \substr($subject, $cursor);
        if ($after !== '') {
            $node->insertBefore(new Text($after));
        }

        $node->detach();
    }

    /**
     * Separates a matched URL from trailing characters that should not be part of the link.
     *
     * Applied in order: trailing punctuation, a trailing entity-like reference,
     * then unbalanced closing parentheses.
     *
     * @param string $url Raw regex match
     *
     * @return string[] Two elements: the trimmed URL, and the removed suffix (possibly empty)
     */
    private static function splitTrailing(string $url): array
    {
        $leftovers = '';

        // Does the URL end with punctuation that should be stripped?
        // The first group is lazy so the WHOLE trailing run is removed, not just its last character.
        if (\preg_match('/(.+?)([?!.,:*_~]+)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2];
        }

        // Does the URL end with something that looks like an entity reference?
        if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2] . $leftovers;
        }

        // Does the URL need its closing paren(s) chopped off?
        if (\substr($url, -1) === ')' && ($diff = self::diffParens($url)) > 0) {
            // Never remove more than the run of ")" actually sitting at the end,
            // otherwise non-paren characters would be dropped from the text.
            $trailing = \strlen($url) - \strlen(\rtrim($url, ')'));
            $strip = \min($diff, $trailing);

            $url = \substr($url, 0, -$strip);
            $leftovers = \str_repeat(')', $strip) . $leftovers;
        }

        return [$url, $leftovers];
    }

    /**
     * Inserts a Link for $url immediately before $node, labelled with the URL itself.
     */
    private static function addLink(Text $node, string $url): void
    {
        $node->insertBefore(new Link(self::resolveDestination($url), $url));
    }

    /**
     * Returns the link destination for a matched URL.
     *
     * 'www.' URLs get 'http://' prepended. The comparison is case-insensitive
     * because the URL pattern itself is matched case-insensitively.
     */
    private static function resolveDestination(string $url): string
    {
        if (\strncasecmp($url, 'www.', 4) === 0) {
            return 'http://' . $url;
        }

        return $url;
    }

    /**
     * Counts how many more closing than opening parentheses $content contains.
     *
     * @param string $content
     *
     * @return int Positive when there are surplus ")" characters; zero or negative otherwise
     */
    private static function diffParens(string $content): int
    {
        // Scan the entire autolink for the total number of parentheses.
        // If there is a greater number of closing parentheses than opening ones,
        // the surplus trailing ones are not considered part of the autolink,
        // in order to facilitate including an autolink inside a parenthesis.
        \preg_match_all('/[()]/', $content, $matches);

        $charCount = ['(' => 0, ')' => 0];
        foreach ($matches[0] as $char) {
            $charCount[$char]++;
        }

        return $charCount[')'] - $charCount['('];
    }
}
154