<?php

declare(strict_types=1);

namespace GuzzleHttp\Psr7;

use Psr\Http\Message\UriInterface;

/**
 * Provides methods to normalize and compare URIs.
 *
 * All methods are static; the class cannot be instantiated.
 *
 * @author Tobias Schultze
 *
 * @see https://datatracker.ietf.org/doc/html/rfc3986#section-6
 */
final class UriNormalizer
{
    /**
     * Default normalizations which only include the ones that preserve semantics.
     */
    public const PRESERVING_NORMALIZATIONS =
        self::CAPITALIZE_PERCENT_ENCODING |
        self::DECODE_UNRESERVED_CHARACTERS |
        self::CONVERT_EMPTY_PATH |
        self::REMOVE_DEFAULT_HOST |
        self::REMOVE_DEFAULT_PORT |
        self::REMOVE_DOT_SEGMENTS;

    /**
     * All letters within a percent-encoding triplet (e.g., "%3A") are case-insensitive, and should be capitalized.
     *
     * Example: http://example.org/a%c2%b1b → http://example.org/a%C2%B1b
     */
    public const CAPITALIZE_PERCENT_ENCODING = 1;

    /**
     * Decodes percent-encoded octets of unreserved characters.
     *
     * For consistency, percent-encoded octets in the ranges of ALPHA (%41–%5A and %61–%7A), DIGIT (%30–%39),
     * hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E) should not be created by URI producers and,
     * when found in a URI, should be decoded to their corresponding unreserved characters by URI normalizers.
     *
     * Example: http://example.org/%7Eusern%61me/ → http://example.org/~username/
     */
    public const DECODE_UNRESERVED_CHARACTERS = 2;

    /**
     * Converts the empty path to "/" for http and https URIs.
     *
     * Example: http://example.org → http://example.org/
     */
    public const CONVERT_EMPTY_PATH = 4;

    /**
     * Removes the default host of the given URI scheme from the URI.
     *
     * Only the "file" scheme defines the default host "localhost".
     * All of `file:/myfile`, `file:///myfile`, and `file://localhost/myfile`
     * are equivalent according to RFC 3986. The first format is not accepted
     * by PHP's stream functions and thus already normalized implicitly to the
     * second format in the Uri class. See `GuzzleHttp\Psr7\Uri::composeComponents`.
     *
     * Example: file://localhost/myfile → file:///myfile
     */
    public const REMOVE_DEFAULT_HOST = 8;

    /**
     * Removes the default port of the given URI scheme from the URI.
     *
     * Example: http://example.org:80/ → http://example.org/
     */
    public const REMOVE_DEFAULT_PORT = 16;

    /**
     * Removes unnecessary dot-segments.
     *
     * Dot-segments in relative-path references are not removed as it would
     * change the semantics of the URI reference.
     *
     * Example: http://example.org/../a/b/../c/./d.html → http://example.org/a/c/d.html
     */
    public const REMOVE_DOT_SEGMENTS = 32;

    /**
     * Paths which include two or more adjacent slashes are converted to one.
     *
     * Webservers usually ignore duplicate slashes and treat those URIs equivalent.
     * But in theory those URIs do not need to be equivalent. So this normalization
     * may change the semantics. Encoded slashes (%2F) are not removed.
     *
     * Example: http://example.org//foo///bar.html → http://example.org/foo/bar.html
     */
    public const REMOVE_DUPLICATE_SLASHES = 64;

    /**
     * Sort query parameters with their values in alphabetical order.
     *
     * However, the order of parameters in a URI may be significant (this is not defined by the standard).
     * So this normalization is not safe and may change the semantics of the URI.
     *
     * Example: ?lang=en&article=fred → ?article=fred&lang=en
     *
     * Note: The sorting is neither locale nor Unicode aware (the URI query does not get decoded at all) as the
     * purpose is to be able to compare URIs in a reproducible way, not to have the params sorted perfectly.
     * The "key=value" pairs are compared as plain strings, so numeric-looking pairs such as "1" and "01" are
     * never treated as equal numbers.
     */
    public const SORT_QUERY_PARAMETERS = 128;

    /**
     * Returns a normalized URI.
     *
     * The scheme and host component are already normalized to lowercase per PSR-7 UriInterface.
     * This method adds additional normalizations that can be configured with the $flags parameter.
     *
     * PSR-7 UriInterface cannot distinguish between an empty component and a missing component as
     * getQuery(), getFragment() etc. always return a string. This means the URIs "/?#" and "/" are
     * treated equivalent which is not necessarily true according to RFC 3986. But that difference
     * is highly uncommon in reality. So this potential normalization is implied in PSR-7 as well.
     *
     * The enabled normalizations are applied in the order in which the flag constants are declared,
     * so unreserved characters such as "%2E" are decoded before dot-segments are removed.
     *
     * @param UriInterface $uri   The URI to normalize
     * @param int          $flags A bitmask of normalizations to apply, see constants
     *
     * @return UriInterface The normalized URI; the given instance itself when no normalization is applied
     *
     * @see https://datatracker.ietf.org/doc/html/rfc3986#section-6.2
     */
    public static function normalize(UriInterface $uri, int $flags = self::PRESERVING_NORMALIZATIONS): UriInterface
    {
        if ($flags & self::CAPITALIZE_PERCENT_ENCODING) {
            $uri = self::capitalizePercentEncoding($uri);
        }

        if ($flags & self::DECODE_UNRESERVED_CHARACTERS) {
            $uri = self::decodeUnreservedCharacters($uri);
        }

        if ($flags & self::CONVERT_EMPTY_PATH && $uri->getPath() === ''
            && ($uri->getScheme() === 'http' || $uri->getScheme() === 'https')
        ) {
            $uri = $uri->withPath('/');
        }

        if ($flags & self::REMOVE_DEFAULT_HOST && $uri->getScheme() === 'file' && $uri->getHost() === 'localhost') {
            $uri = $uri->withHost('');
        }

        if ($flags & self::REMOVE_DEFAULT_PORT && $uri->getPort() !== null && Uri::isDefaultPort($uri)) {
            $uri = $uri->withPort(null);
        }

        if ($flags & self::REMOVE_DOT_SEGMENTS && !Uri::isRelativePathReference($uri)) {
            $uri = $uri->withPath(UriResolver::removeDotSegments($uri->getPath()));
        }

        if ($flags & self::REMOVE_DUPLICATE_SLASHES) {
            // Collapse every run of two or more literal slashes into a single one.
            $uri = $uri->withPath(preg_replace('#//++#', '/', $uri->getPath()));
        }

        if ($flags & self::SORT_QUERY_PARAMETERS && $uri->getQuery() !== '') {
            $queryKeyValues = explode('&', $uri->getQuery());
            // SORT_STRING forces plain string comparison. The default SORT_REGULAR compares numeric-looking
            // items as numbers ("1" equals "01", "1e3" equals "1000"), which makes the resulting order depend
            // on the input order and therefore breaks reproducible comparison of URIs.
            sort($queryKeyValues, SORT_STRING);
            $uri = $uri->withQuery(implode('&', $queryKeyValues));
        }

        return $uri;
    }

    /**
     * Whether two URIs can be considered equivalent.
     *
     * Both URIs are normalized automatically before comparison with the given $normalizations bitmask. The method also
     * accepts relative URI references and returns true when they are equivalent. This of course assumes they will be
     * resolved against the same base URI. If this is not the case, determination of equivalence or difference of
     * relative references does not mean anything.
     *
     * @param UriInterface $uri1           A URI to compare
     * @param UriInterface $uri2           A URI to compare
     * @param int          $normalizations A bitmask of normalizations to apply, see constants
     *
     * @return bool True when the string forms of both normalized URIs are identical
     *
     * @see https://datatracker.ietf.org/doc/html/rfc3986#section-6.1
     */
    public static function isEquivalent(UriInterface $uri1, UriInterface $uri2, int $normalizations = self::PRESERVING_NORMALIZATIONS): bool
    {
        return (string) self::normalize($uri1, $normalizations) === (string) self::normalize($uri2, $normalizations);
    }

    /**
     * Upper-cases the hex digits of all percent-encoded triplets in the path and the query.
     *
     * Other URI components are left untouched.
     */
    private static function capitalizePercentEncoding(UriInterface $uri): UriInterface
    {
        // Matches a run of one or more consecutive "%XX" triplets, so each run needs only one callback call.
        $regex = '/(?:%[A-Fa-f0-9]{2})++/';

        $callback = static function (array $match): string {
            return strtoupper($match[0]);
        };

        return
            $uri->withPath(
                preg_replace_callback($regex, $callback, $uri->getPath())
            )->withQuery(
                preg_replace_callback($regex, $callback, $uri->getQuery())
            );
    }

    /**
     * Decodes percent-encoded unreserved characters in the path and the query.
     *
     * Other URI components are left untouched.
     */
    private static function decodeUnreservedCharacters(UriInterface $uri): UriInterface
    {
        // Case-insensitive match of the triplets for "-" (2D), "." (2E), "_" (5F), "~" (7E),
        // the digits (30-39) and the letters (41-4F, 50-5A, 61-6F, 70-7A).
        $regex = '/%(?:2D|2E|5F|7E|3[0-9]|[46][1-9A-F]|[57][0-9A])/i';

        $callback = static function (array $match): string {
            return rawurldecode($match[0]);
        };

        return
            $uri->withPath(
                preg_replace_callback($regex, $callback, $uri->getPath())
            )->withQuery(
                preg_replace_callback($regex, $callback, $uri->getQuery())
            );
    }

    private function __construct()
    {
        // cannot be instantiated
    }
}