UriString.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513
  1. <?php
  2. /**
  3. * League.Uri (https://uri.thephpleague.com)
  4. *
  5. * (c) Ignace Nyamagana Butera <nyamsprod@gmail.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. declare(strict_types=1);
  11. namespace League\Uri;
  12. use League\Uri\Exceptions\ConversionFailed;
  13. use League\Uri\Exceptions\MissingFeature;
  14. use League\Uri\Exceptions\SyntaxError;
  15. use League\Uri\Idna\Converter;
  16. use Stringable;
  17. use function array_merge;
  18. use function explode;
  19. use function filter_var;
  20. use function inet_pton;
  21. use function preg_match;
  22. use function rawurldecode;
  23. use function sprintf;
  24. use function strpos;
  25. use function substr;
  26. use const FILTER_FLAG_IPV6;
  27. use const FILTER_VALIDATE_IP;
  /**
   * A class to parse a URI string according to RFC3986.
   *
   * The class only exposes static helpers: `parse()` / `parseAuthority()` split a
   * string into its components and `build()` / `buildUri()` / `buildAuthority()`
   * perform the reverse operation.
   *
   * @link https://tools.ietf.org/html/rfc3986
   * @package League\Uri
   * @author Ignace Nyamagana Butera <nyamsprod@gmail.com>
   * @since 6.0.0
   *
   * @phpstan-type AuthorityMap array{user:?string, pass:?string, host:?string, port:?int}
   * @phpstan-type ComponentMap array{scheme:?string, user:?string, pass:?string, host:?string, port:?int, path:string, query:?string, fragment:?string}
   * @phpstan-type InputComponentMap array{scheme? : ?string, user? : ?string, pass? : ?string, host? : ?string, port? : ?int, path? : ?string, query? : ?string, fragment? : ?string}
   */
  final class UriString
  {
      /**
       * Default URI component values.
       *
       * @var ComponentMap
       */
      private const URI_COMPONENTS = [
          'scheme' => null, 'user' => null, 'pass' => null, 'host' => null,
          'port' => null, 'path' => '', 'query' => null, 'fragment' => null,
      ];

      /**
       * Simple URIs which do not need any parsing.
       *
       * Each entry only lists the components that differ from URI_COMPONENTS.
       *
       * @var array<string, array<string>>
       */
      private const URI_SHORTCUTS = [
          '' => [],
          '#' => ['fragment' => ''],
          '?' => ['query' => ''],
          '?#' => ['query' => '', 'fragment' => ''],
          '/' => ['path' => '/'],
          '//' => ['host' => ''],
      ];

      /**
       * Range of invalid characters in a URI string (ASCII control characters).
       *
       * @var string
       */
      private const REGEXP_INVALID_URI_CHARS = '/[\x00-\x1f\x7f]/';

      /**
       * RFC3986 regular expression URI splitter.
       *
       * @link https://tools.ietf.org/html/rfc3986#appendix-B
       * @var string
       */
      private const REGEXP_URI_PARTS = ',^
          (?<scheme>(?<scontent>[^:/?\#]+):)? # URI scheme component
          (?<authority>//(?<acontent>[^/?\#]*))? # URI authority part
          (?<path>[^?\#]*) # URI path component
          (?<query>\?(?<qcontent>[^\#]*))? # URI query component
          (?<fragment>\#(?<fcontent>.*))? # URI fragment component
      ,x';

      /**
       * URI scheme regular expression.
       *
       * The empty string is accepted so that scheme-less URIs pass the check.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.1
       * @var string
       */
      private const REGEXP_URI_SCHEME = '/^([a-z][a-z\d+.-]*)?$/i';

      /**
       * IPvFuture regular expression.
       *
       * The `version` group captures the complete hexadecimal version number
       * (the quantifier is inside the group); the `D` modifier prevents `$` from
       * matching before a trailing newline.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       * @var string
       */
      private const REGEXP_IP_FUTURE = '/^
          v(?<version>[A-F0-9]+)\.
          (?:
              (?<unreserved>[a-z0-9_~\-\.])|
              (?<sub_delims>[!$&\'()*+,;=:]) # also include the : character
          )+
      $/ixD';

      /**
       * General registered name regular expression.
       *
       * The `D` modifier prevents `$` from matching before a trailing newline so
       * that a host ending with a line feed is rejected like any other host
       * containing one.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       * @var string
       */
      private const REGEXP_REGISTERED_NAME = '/(?(DEFINE)
          (?<unreserved>[a-z0-9_~\-]) # . is missing as it is used to separate labels
          (?<sub_delims>[!$&\'()*+,;=])
          (?<encoded>%[A-F0-9]{2})
          (?<reg_name>(?:(?&unreserved)|(?&sub_delims)|(?&encoded))*)
      )
      ^(?:(?&reg_name)\.)*(?&reg_name)\.?$/ixD';

      /**
       * Invalid characters in host regular expression.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       * @var string
       */
      private const REGEXP_INVALID_HOST_CHARS = '/
          [:\/?#\[\]@ ] # gen-delims characters as well as the space character
      /ix';

      /**
       * Invalid path for URI without scheme and authority regular expression.
       *
       * Matches a path whose first segment contains a colon and is followed by
       * at least one `/`.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.3
       * @var string
       */
      private const REGEXP_INVALID_PATH = ',^(([^/]*):)(.*)?/,';

      /**
       * Host and Port splitter regular expression.
       *
       * @var string
       */
      private const REGEXP_HOST_PORT = ',^(?<host>\[.*\]|[^:]*)(:(?<port>.*))?$,';

      /**
       * IDN Host detector regular expression.
       *
       * Matches any byte outside of the \x20-\x7f range.
       *
       * @var string
       */
      private const REGEXP_IDN_PATTERN = '/[^\x20-\x7f]/';

      /**
       * Only link-local addresses (fe80::/10 block) can have a Zone ID attached
       * to them.
       *
       * The validation compares the first two bytes of the packed address with
       * this value, i.e. it only accepts addresses starting with `fe80`.
       *
       * @var string
       */
      private const ZONE_ID_ADDRESS_BLOCK = "\xfe\x80";

      /**
       * Maximum number of hosts kept in the validation cache.
       *
       * @var int
       */
      private const MAXIMUM_HOST_CACHED = 100;

      /**
       * Generate a URI string representation from its parsed representation
       * returned by League\UriString::parse() or PHP's parse_url.
       *
       * If you supply your own array, you are responsible for providing
       * valid components without their URI delimiters.
       *
       * Missing or `null` components are treated as undefined; a missing path
       * is treated as the empty string.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-5.3
       * @link https://tools.ietf.org/html/rfc3986#section-7.5
       *
       * @param InputComponentMap $components
       */
      public static function build(array $components): string
      {
          return self::buildUri(
              $components['scheme'] ?? null,
              self::buildAuthority($components),
              $components['path'] ?? '',
              $components['query'] ?? null,
              $components['fragment'] ?? null,
          );
      }

      /**
       * Generate a URI string representation based on RFC3986 algorithm.
       *
       * Valid URI components MUST be provided without their URI delimiters
       * but properly encoded. A `null` component is considered undefined and
       * is skipped together with its delimiter, whereas an empty string
       * component is emitted with its delimiter.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-5.3
       * @link https://tools.ietf.org/html/rfc3986#section-7.5
       */
      public static function buildUri(
          ?string $scheme,
          ?string $authority,
          string $path,
          ?string $query,
          ?string $fragment,
      ): string {
          $uri = '';
          if (null !== $scheme) {
              $uri .= $scheme.':';
          }

          if (null !== $authority) {
              $uri .= '//'.$authority;
          }

          $uri .= $path;
          if (null !== $query) {
              $uri .= '?'.$query;
          }

          if (null !== $fragment) {
              $uri .= '#'.$fragment;
          }

          return $uri;
      }

      /**
       * Generate a URI authority representation from its parsed representation.
       *
       * Returns `null` when the host is missing or `null`. The password is only
       * used when a user is present.
       *
       * @param InputComponentMap $components
       */
      public static function buildAuthority(array $components): ?string
      {
          if (!isset($components['host'])) {
              return null;
          }

          $authority = $components['host'];
          if (isset($components['port'])) {
              $authority .= ':'.$components['port'];
          }

          if (!isset($components['user'])) {
              return $authority;
          }

          $authority = '@'.$authority;
          if (!isset($components['pass'])) {
              return $components['user'].$authority;
          }

          return $components['user'].':'.$components['pass'].$authority;
      }

      /**
       * Parse a URI string into its components.
       *
       * This method parses a URI and returns an associative array containing any
       * of the various components of the URI that are present.
       *
       * <code>
       * $components = UriString::parse('http://foo@test.example.com:42?query#');
       * var_export($components);
       * //will display
       * array(
       *   'scheme' => 'http',           // the URI scheme component
       *   'user' => 'foo',              // the URI user component
       *   'pass' => null,               // the URI pass component
       *   'host' => 'test.example.com', // the URI host component
       *   'port' => 42,                 // the URI port component
       *   'path' => '',                 // the URI path component
       *   'query' => 'query',           // the URI query component
       *   'fragment' => '',             // the URI fragment component
       * );
       * </code>
       *
       * The returned array is similar to PHP's parse_url return value with the following
       * differences:
       *
       * <ul>
       * <li>All components are always present in the returned array</li>
       * <li>Empty and undefined components are treated differently. An empty component is
       * set to the empty string while an undefined component is set to the `null` value.</li>
       * <li>The path component is never undefined</li>
       * <li>The method parses the URI following the RFC3986 rules, but you are still
       * required to validate the returned components against its related scheme specific rules.</li>
       * </ul>
       *
       * @link https://tools.ietf.org/html/rfc3986
       *
       * @throws SyntaxError if the URI contains invalid characters
       * @throws SyntaxError if the URI contains an invalid scheme
       * @throws SyntaxError if the URI contains an invalid path
       * @throws SyntaxError if the URI authority contains an invalid host or port
       *
       * @return ComponentMap
       */
      public static function parse(Stringable|string|int $uri): array
      {
          $uri = (string) $uri;
          if (isset(self::URI_SHORTCUTS[$uri])) {
              /** @var ComponentMap $components */
              $components = array_merge(self::URI_COMPONENTS, self::URI_SHORTCUTS[$uri]);

              return $components;
          }

          if (1 === preg_match(self::REGEXP_INVALID_URI_CHARS, $uri)) {
              throw new SyntaxError(sprintf('The uri `%s` contains invalid characters', $uri));
          }

          // If the first character is a known URI delimiter parsing can be simplified.
          // The empty string is handled by the shortcuts above, so index 0 always exists.
          $firstChar = $uri[0];

          // The URI is made of the fragment only
          if ('#' === $firstChar) {
              [, $fragment] = explode('#', $uri, 2);
              $components = self::URI_COMPONENTS;
              $components['fragment'] = $fragment;

              return $components;
          }

          // The URI is made of the query and fragment
          if ('?' === $firstChar) {
              [, $partial] = explode('?', $uri, 2);
              [$query, $fragment] = explode('#', $partial, 2) + [1 => null];
              $components = self::URI_COMPONENTS;
              $components['query'] = $query;
              $components['fragment'] = $fragment;

              return $components;
          }

          // Use the RFC3986 URI regexp to split the URI
          preg_match(self::REGEXP_URI_PARTS, $uri, $parts);
          // preg_match drops trailing groups which did not participate in the match
          $parts += ['query' => '', 'fragment' => ''];

          if (':' === ($parts['scheme'] ?? null) || 1 !== preg_match(self::REGEXP_URI_SCHEME, $parts['scontent'] ?? '')) {
              throw new SyntaxError(sprintf('The uri `%s` contains an invalid scheme', $uri));
          }

          if ('' === ($parts['scheme'] ?? '').($parts['authority'] ?? '') && 1 === preg_match(self::REGEXP_INVALID_PATH, $parts['path'] ?? '')) {
              throw new SyntaxError(sprintf('The uri `%s` contains an invalid path.', $uri));
          }

          // An empty group means the delimiter was absent (component undefined => null),
          // whereas a present delimiter with no content yields the empty string.
          /** @var ComponentMap $components */
          $components = array_merge(
              self::URI_COMPONENTS,
              '' === ($parts['authority'] ?? null) ? [] : self::parseAuthority($parts['acontent'] ?? null),
              [
                  'path' => $parts['path'] ?? '',
                  'scheme' => '' === ($parts['scheme'] ?? null) ? null : ($parts['scontent'] ?? null),
                  'query' => '' === $parts['query'] ? null : ($parts['qcontent'] ?? null),
                  'fragment' => '' === $parts['fragment'] ? null : ($parts['fcontent'] ?? null),
              ]
          );

          return $components;
      }

      /**
       * Parses the URI authority part.
       *
       * A `null` authority yields only `null` components; an empty authority
       * yields an empty host. The user information is split from the host on
       * the first `@` character and the password from the user on the first `:`.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2
       *
       * @throws SyntaxError If the port component is invalid
       * @throws SyntaxError If the host component is invalid
       *
       * @return AuthorityMap
       */
      public static function parseAuthority(Stringable|string|null $authority): array
      {
          $components = ['user' => null, 'pass' => null, 'host' => null, 'port' => null];
          if (null === $authority) {
              return $components;
          }

          $authority = (string) $authority;
          $components['host'] = '';
          if ('' === $authority) {
              return $components;
          }

          $parts = explode('@', $authority, 2);
          if (isset($parts[1])) {
              [$components['user'], $components['pass']] = explode(':', $parts[0], 2) + [1 => null];
          }

          preg_match(self::REGEXP_HOST_PORT, $parts[1] ?? $parts[0], $matches);
          // The port group is absent from the matches when no `:port` suffix is present
          $matches += ['port' => ''];

          $components['port'] = self::filterPort($matches['port']);
          $components['host'] = self::filterHost($matches['host'] ?? '');

          return $components;
      }

      /**
       * Filter and format the port component.
       *
       * An empty port is considered undefined and returns `null`; otherwise
       * the port must be made of digits only and is returned as an integer.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       *
       * @throws SyntaxError if the port is not made of digits only
       */
      private static function filterPort(string $port): ?int
      {
          return match (true) {
              '' === $port => null,
              1 === preg_match('/^\d+$/D', $port) => (int) $port,
              default => throw new SyntaxError(sprintf('The port `%s` is invalid', $port)),
          };
      }

      /**
       * Validates the host component and returns it unchanged.
       *
       * The empty host is always valid. A host enclosed in square brackets is
       * validated as an IP literal, any other host as a registered name.
       * Successfully validated hosts are remembered in a bounded cache so that
       * they are not validated again.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       *
       * @throws SyntaxError if the registered name or the IP literal is invalid
       */
      private static function filterHost(string $host): string
      {
          if ('' === $host) {
              return $host;
          }

          // PHP converts integer-like string keys (e.g. "127") to integers
          /** @var array<array-key, 1> $hostCache */
          static $hostCache = [];
          if (isset($hostCache[$host])) {
              return $host;
          }

          if ('[' !== $host[0] || !str_ends_with($host, ']')) {
              self::filterRegisteredName($host);
          } elseif (!self::isIpHost(substr($host, 1, -1))) {
              throw new SyntaxError(sprintf('Host `%s` is invalid : the IP host is malformed', $host));
          }

          // Evict the oldest entry only once the new host is known to be valid.
          // unset() is used instead of array_shift() because the latter renumbers
          // integer keys, which would corrupt the entries of all-digit hosts.
          if (self::MAXIMUM_HOST_CACHED <= count($hostCache)) {
              unset($hostCache[array_key_first($hostCache)]);
          }

          $hostCache[$host] = 1;

          return $host;
      }

      /**
       * Throws if the host is not a registered name and not a valid IDN host.
       *
       * The host is percent-decoded before being matched against the registered
       * name pattern. When it does not match and contains bytes outside of the
       * \x20-\x7f range, validation is delegated to the IDNA converter.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       *
       * @throws SyntaxError if the registered name is invalid
       * @throws MissingFeature if IDN support or ICU requirement are not available or met.
       * @throws ConversionFailed if the submitted IDN host cannot be converted to a valid ascii form
       */
      private static function filterRegisteredName(string $host): void
      {
          $formattedHost = rawurldecode($host);
          if (1 === preg_match(self::REGEXP_REGISTERED_NAME, $formattedHost)) {
              return;
          }

          // To test IDN host non-ascii characters must be present in the host
          if (1 !== preg_match(self::REGEXP_IDN_PATTERN, $formattedHost)) {
              throw new SyntaxError(sprintf('Host `%s` is invalid: the host is not a valid registered name', $host));
          }

          Converter::toAsciiOrFail($host);
      }

      /**
       * Validates an IPv6/IPvFuture host (given without its enclosing brackets).
       *
       * The host is valid when it is either:
       * - an IPv6 address;
       * - an IPvFuture address whose version number is neither 4 nor 6;
       * - an IPv6 address starting with `fe80` followed by a `%` introduced zone
       *   identifier which, once percent-decoded, contains no invalid host character.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
       * @link https://tools.ietf.org/html/rfc6874#section-2
       * @link https://tools.ietf.org/html/rfc6874#section-4
       */
      private static function isIpHost(string $ipHost): bool
      {
          if (false !== filter_var($ipHost, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
              return true;
          }

          if (1 === preg_match(self::REGEXP_IP_FUTURE, $ipHost, $matches)) {
              // Leading zeros do not change the version number: `v04` and `v06`
              // are rejected exactly like `v4` and `v6`.
              return !in_array(ltrim($matches['version'], '0'), ['4', '6'], true);
          }

          $pos = strpos($ipHost, '%');
          if (false === $pos || 1 === preg_match(self::REGEXP_INVALID_HOST_CHARS, rawurldecode(substr($ipHost, $pos)))) {
              return false;
          }

          // Validate the address part preceding the zone identifier
          $ipHost = substr($ipHost, 0, $pos);

          return false !== filter_var($ipHost, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)
              && str_starts_with((string) inet_pton($ipHost), self::ZONE_ID_ADDRESS_BLOCK);
      }
  }