Idn.php 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com> and Trevor Rowbotham <trevor.rowbotham@pm.me>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Polyfill\Intl\Idn;
  11. use Symfony\Polyfill\Intl\Idn\Resources\unidata\DisallowedRanges;
  12. use Symfony\Polyfill\Intl\Idn\Resources\unidata\Regex;
  13. /**
  14. * @see https://www.unicode.org/reports/tr46/
  15. *
  16. * @internal
  17. */
  18. final class Idn
  19. {
  // Error flags. Each value is a distinct bit that is OR-ed into the "errors" bitmask
  // reported through the $idna_info argument of idn_to_ascii() and idn_to_utf8().
  20. public const ERROR_EMPTY_LABEL = 1;
  21. public const ERROR_LABEL_TOO_LONG = 2;
  22. public const ERROR_DOMAIN_NAME_TOO_LONG = 4;
  23. public const ERROR_LEADING_HYPHEN = 8;
  24. public const ERROR_TRAILING_HYPHEN = 0x10;
  25. public const ERROR_HYPHEN_3_4 = 0x20;
  26. public const ERROR_LEADING_COMBINING_MARK = 0x40;
  27. public const ERROR_DISALLOWED = 0x80;
  28. public const ERROR_PUNYCODE = 0x100;
  29. public const ERROR_LABEL_HAS_DOT = 0x200;
  30. public const ERROR_INVALID_ACE_LABEL = 0x400;
  31. public const ERROR_BIDI = 0x800;
  32. public const ERROR_CONTEXTJ = 0x1000;
  33. public const ERROR_CONTEXTO_PUNCTUATION = 0x2000;
  34. public const ERROR_CONTEXTO_DIGITS = 0x4000;
  // Values compared against the $variant argument of idn_to_ascii() and idn_to_utf8().
  35. public const INTL_IDNA_VARIANT_2003 = 0;
  36. public const INTL_IDNA_VARIANT_UTS46 = 1;
  // Option flags for the $options argument; the flags this class reads are tested with a
  // bitwise AND.
  37. public const IDNA_DEFAULT = 0;
  38. public const IDNA_ALLOW_UNASSIGNED = 1;
  39. public const IDNA_USE_STD3_RULES = 2;
  40. public const IDNA_CHECK_BIDI = 4;
  41. public const IDNA_CHECK_CONTEXTJ = 8;
  42. public const IDNA_NONTRANSITIONAL_TO_ASCII = 16;
  43. public const IDNA_NONTRANSITIONAL_TO_UNICODE = 32;
  // Length limits, in bytes, enforced by validateDomainAndLabelLength().
  44. public const MAX_DOMAIN_SIZE = 253;
  45. public const MAX_LABEL_SIZE = 63;
  // Punycode parameters used by adaptBias(), punycodeDecode() and punycodeEncode().
  46. public const BASE = 36;
  47. public const TMIN = 1;
  48. public const TMAX = 26;
  49. public const SKEW = 38;
  50. public const DAMP = 700;
  51. public const INITIAL_BIAS = 72;
  52. public const INITIAL_N = 128;
  53. public const DELIMITER = '-';
  // Upper bound the Punycode routines use to detect integer overflow.
  54. public const MAX_INT = 2147483647;
  55. /**
  56. * Contains the numeric value of a basic code point (for use in representing integers) in the
  57. * range 0 to BASE-1, or -1 if the code point does not represent a value.
  *
  * The table has 256 entries and is indexed by a byte value in punycodeDecode().
  58. *
  59. * @var array<int, int>
  60. */
  61. private static $basicToDigit = [
  62. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  63. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  64. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  65. 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
  66. -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  67. 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
  68. -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  69. 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
  70. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  71. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  72. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  73. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  74. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  75. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  76. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  77. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  78. ];
  // Loaded on first use by isValidContextJ() from Resources/unidata/virama.php.
  79. /**
  80. * @var array<int, int>
  81. */
  82. private static $virama;
  // The six tables below are loaded together from Resources/unidata/ the first time
  // lookupCodePointStatus() runs, and are keyed by code point.
  83. /**
  84. * @var array<int, string>
  85. */
  86. private static $mapped;
  87. /**
  88. * @var array<int, bool>
  89. */
  90. private static $ignored;
  91. /**
  92. * @var array<int, string>
  93. */
  94. private static $deviation;
  95. /**
  96. * @var array<int, bool>
  97. */
  98. private static $disallowed;
  99. /**
  100. * @var array<int, string>
  101. */
  102. private static $disallowed_STD3_mapped;
  103. /**
  104. * @var array<int, bool>
  105. */
  106. private static $disallowed_STD3_valid;
  // Set to true by lookupCodePointStatus() once the mapping tables have been required.
  107. /**
  108. * @var bool
  109. */
  110. private static $mappingTableLoaded = false;
  /**
   * Converts a domain name to its ASCII form: the name is processed label by label and every
   * label that still contains a non-ASCII code point is Punycode-encoded behind an "xn--" prefix.
   *
   * @see https://www.unicode.org/reports/tr46/#ToASCII
   *
   * @param string $domainName Domain name to convert
   * @param int    $options    Bitmask of the IDNA_* option constants
   * @param int    $variant    INTL_IDNA_VARIANT_UTS46, or the deprecated INTL_IDNA_VARIANT_2003
   * @param array  $idna_info  Receives the keys "result", "isTransitionalDifferent" and "errors"
   *
   * @return string|false The converted name, or false when at least one error flag was recorded
   *
   * @throws \ValueError On PHP 8.4.0 and later when $domainName is an empty string
   */
  public static function idn_to_ascii($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = [])
  {
      // PHP_VERSION_ID is exactly 80400 on PHP 8.4.0, so the comparison must be inclusive to
      // cover the first 8.4 release as well.
      if (\PHP_VERSION_ID >= 80400 && '' === $domainName) {
          throw new \ValueError('idn_to_ascii(): Argument #1 ($domain) cannot be empty');
      }

      if (self::INTL_IDNA_VARIANT_2003 === $variant) {
          @trigger_error('idn_to_ascii(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED);
      }

      // Translate the integer bitmask into the named processing flags used by process().
      $options = [
          'CheckHyphens' => true,
          'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI),
          'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ),
          'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES),
          'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_ASCII),
          'VerifyDnsLength' => true,
      ];

      $info = new Info();
      $labels = self::process((string) $domainName, $options, $info);

      foreach ($labels as $i => $label) {
          // Only convert labels to punycode that contain non-ASCII code points
          if (1 === preg_match('/[^\x00-\x7F]/', $label)) {
              try {
                  $label = 'xn--'.self::punycodeEncode($label);
              } catch (\Exception $e) {
                  // The label is left unencoded; the error flag makes the call return false.
                  $info->errors |= self::ERROR_PUNYCODE;
              }

              $labels[$i] = $label;
          }
      }

      if ($options['VerifyDnsLength']) {
          self::validateDomainAndLabelLength($labels, $info);
      }

      $idna_info = [
          'result' => implode('.', $labels),
          'isTransitionalDifferent' => $info->transitionalDifferent,
          'errors' => $info->errors,
      ];

      return 0 === $info->errors ? $idna_info['result'] : false;
  }
  /**
   * Converts a domain name to its Unicode (UTF-8) form by running the shared processing steps,
   * which decode any "xn--" labels and validate every label.
   *
   * @see https://www.unicode.org/reports/tr46/#ToUnicode
   *
   * @param string $domainName Domain name to convert
   * @param int    $options    Bitmask of the IDNA_* option constants
   * @param int    $variant    INTL_IDNA_VARIANT_UTS46, or the deprecated INTL_IDNA_VARIANT_2003
   * @param array  $idna_info  Receives the keys "result", "isTransitionalDifferent" and "errors"
   *
   * @return string|false The converted name, or false when at least one error flag was recorded
   *
   * @throws \ValueError On PHP 8.4.0 and later when $domainName is an empty string
   */
  public static function idn_to_utf8($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = [])
  {
      // PHP_VERSION_ID is exactly 80400 on PHP 8.4.0, so the comparison must be inclusive to
      // cover the first 8.4 release as well.
      if (\PHP_VERSION_ID >= 80400 && '' === $domainName) {
          throw new \ValueError('idn_to_utf8(): Argument #1 ($domain) cannot be empty');
      }

      if (self::INTL_IDNA_VARIANT_2003 === $variant) {
          @trigger_error('idn_to_utf8(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED);
      }

      $info = new Info();
      // No 'VerifyDnsLength' key is passed here; process() treats its absence as ToUnicode.
      $labels = self::process((string) $domainName, [
          'CheckHyphens' => true,
          'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI),
          'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ),
          'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES),
          'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_UNICODE),
      ], $info);

      $idna_info = [
          'result' => implode('.', $labels),
          'isTransitionalDifferent' => $info->transitionalDifferent,
          'errors' => $info->errors,
      ];

      return 0 === $info->errors ? $idna_info['result'] : false;
  }
  /**
   * Checks every ZERO WIDTH NON-JOINER (U+200C) and ZERO WIDTH JOINER (U+200D) of a label
   * against the ContextJ rules.
   *
   * @param array<int, int> $codePoints Code points of the label
   * @param string          $label      The same label as a string
   *
   * @return bool False as soon as one joiner fails its rule, true otherwise
   */
  private static function isValidContextJ(array $codePoints, $label)
  {
      if (!isset(self::$virama)) {
          self::$virama = require __DIR__.\DIRECTORY_SEPARATOR.'Resources'.\DIRECTORY_SEPARATOR.'unidata'.\DIRECTORY_SEPARATOR.'virama.php';
      }

      $searchFrom = 0;

      foreach ($codePoints as $position => $current) {
          $isNonJoiner = 0x200C === $current;
          if (!$isNonJoiner && 0x200D !== $current) {
              continue;
          }

          // A joiner with nothing in front of it can never satisfy either rule.
          if (!isset($codePoints[$position - 1])) {
              return false;
          }

          // Rule: Canonical_Combining_Class(Before(cp)) .eq. Virama
          if (isset(self::$virama[$codePoints[$position - 1]])) {
              continue;
          }

          // Rule (U+200C only):
          // (Joining_Type:{L,D})(Joining_Type:T)*\u200C(Joining_Type:T)*(Joining_Type:{R,D})
          if (!$isNonJoiner || 1 !== preg_match(Regex::ZWNJ, $label, $found, \PREG_OFFSET_CAPTURE, $searchFrom)) {
              return false;
          }

          // Advance the search offset by the byte length of the first captured group.
          $searchFrom += \strlen($found[1][0]);
      }

      return true;
  }
  /**
   * Rebuilds the input string code point by code point according to the status each code
   * point has in the IDNA mapping table.
   *
   * @see https://www.unicode.org/reports/tr46/#ProcessingStepMap
   *
   * @param string              $input
   * @param array<string, bool> $options Reads 'UseSTD3ASCIIRules' and 'Transitional_Processing'
   *
   * @return string
   */
  private static function mapCodePoints($input, array $options, Info $info)
  {
      $std3Rules = $options['UseSTD3ASCIIRules'];
      $isTransitional = $options['Transitional_Processing'];
      $result = '';

      foreach (self::utf8Decode($input) as $cp) {
          $entry = self::lookupCodePointStatus($cp, $std3Rules);
          $status = $entry['status'];

          if ('disallowed' === $status || 'valid' === $status) {
              // Kept as-is; disallowed code points are reported later during label validation.
              $result .= mb_chr($cp, 'utf-8');
          } elseif ('mapped' === $status) {
              // U+1E9E is the single mapped code point whose replacement depends on the mode.
              if ($isTransitional && 0x1E9E === $cp) {
                  $result .= 'ss';
              } else {
                  $result .= $entry['mapping'];
              }
          } elseif ('deviation' === $status) {
              $info->transitionalDifferent = true;
              $result .= $isTransitional ? $entry['mapping'] : mb_chr($cp, 'utf-8');
          }
          // An 'ignored' code point contributes nothing to the result.
      }

      return $result;
  }
  /**
   * Runs the shared processing steps: map, normalize, split into labels, then convert and
   * validate each label. Errors are recorded on $info rather than thrown.
   *
   * @see https://www.unicode.org/reports/tr46/#Processing
   *
   * @param string              $domain
   * @param array<string, bool> $options
   *
   * @return array<int, string> The labels of the processed domain name
   */
  private static function process($domain, array $options, Info $info)
  {
      // A missing VerifyDnsLength key means ToUnicode; otherwise this is ToASCII and the
      // option's value has to be respected.
      $rejectEmptyDomain = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'];

      if ('' === $domain && $rejectEmptyDomain) {
          $info->errors |= self::ERROR_EMPTY_LABEL;

          return [$domain];
      }

      // Step 1. Map each code point in the domain name string.
      $domain = self::mapCodePoints($domain, $options, $info);

      // Step 2. Normalize the domain name string to Unicode Normalization Form C.
      if (!\Normalizer::isNormalized($domain, \Normalizer::FORM_C)) {
          $domain = \Normalizer::normalize($domain, \Normalizer::FORM_C);
      }

      // Step 3. Break the string into labels at U+002E (.) FULL STOP.
      $labels = explode('.', $domain);
      $finalIndex = \count($labels) - 1;

      // Step 4. Convert and validate each label in the domain name string.
      foreach ($labels as $index => $label) {
          $labelOptions = $options;

          if (0 === strncmp($label, 'xn--', 4)) {
              $decoded = null;

              // Step 4.1. A label with any code point above U+007F is an error.
              // Step 4.2. Otherwise try to convert the rest of the label from Punycode [RFC3492].
              if (!preg_match('/[^\x00-\x7F]/', $label)) {
                  try {
                      $decoded = self::punycodeDecode(substr($label, 4));
                  } catch (\Exception $e) {
                      $decoded = null;
                  }
              }

              // On either failure, record the error and continue with the next label.
              if (null === $decoded) {
                  $info->errors |= self::ERROR_PUNYCODE;

                  continue;
              }

              // A successfully decoded label replaces the original and is validated
              // with Transitional_Processing switched off.
              $label = $decoded;
              $labelOptions['Transitional_Processing'] = false;
              $labels[$index] = $label;
          }

          // Only the last label, and only when it is not also the first, may be empty.
          $mayBeEmpty = $index > 0 && $index === $finalIndex;
          self::validateLabel($label, $info, $labelOptions, $mayBeEmpty);
      }

      if ($info->bidiDomain && !$info->validBidiDomain) {
          $info->errors |= self::ERROR_BIDI;
      }

      // What to do with a domain name that recorded an error is left to the caller.
      return $labels;
  }
  /**
   * Checks one label against the numbered Bidi conditions and clears $info->validBidiDomain
   * when a condition fails. A label matching Regex::RTL_LABEL also sets $info->bidiDomain.
   *
   * @see https://tools.ietf.org/html/rfc5893#section-2
   *
   * @param string $label
   */
  private static function validateBidiLabel($label, Info $info)
  {
      if (1 === preg_match(Regex::RTL_LABEL, $label)) {
          $info->bidiDomain = true;

          // The conditions are evaluated left to right and stop at the first failure.
          $passes =
              // Step 1. The first character must have Bidi property R or AL for an RTL label.
              1 === preg_match(Regex::BIDI_STEP_1_RTL, $label)
              // Step 2. Only R, AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
              && 1 !== preg_match(Regex::BIDI_STEP_2, $label)
              // Step 3. The label must end with R, AL, EN, or AN, followed by zero or more NSM.
              && 1 === preg_match(Regex::BIDI_STEP_3, $label)
              // Step 4. If an EN is present, no AN may be present, and vice versa.
              && !(1 === preg_match(Regex::BIDI_STEP_4_AN, $label) && 1 === preg_match(Regex::BIDI_STEP_4_EN, $label));
      } else {
          // Otherwise the label is handled as an LTR label.
          $passes =
              // Step 1. The first character must have Bidi property L for an LTR label.
              1 === preg_match(Regex::BIDI_STEP_1_LTR, $label)
              // Step 5. Only L, EN, ES, CS, ET, ON, BN, or NSM are allowed.
              && 1 !== preg_match(Regex::BIDI_STEP_5, $label)
              // Step 6. The label must end with L or EN, followed by zero or more NSM.
              && 1 === preg_match(Regex::BIDI_STEP_6, $label);
      }

      if (!$passes) {
          $info->validBidiDomain = false;
      }
  }
  /**
   * Flags labels longer than MAX_LABEL_SIZE bytes and a domain name longer than
   * MAX_DOMAIN_SIZE bytes (one byte more when the name ends with the empty root label).
   *
   * @param array<int, string> $labels
   */
  private static function validateDomainAndLabelLength(array $labels, Info $info)
  {
      $labelCount = \count($labels);

      // A trailing empty label that is not also the first label is the root label. Its "."
      // delimiter raises the limit by one, and the label itself needs no length check.
      $hasRootLabel = $labelCount > 1 && '' === $labels[$labelCount - 1];
      $sizeLimit = $hasRootLabel ? self::MAX_DOMAIN_SIZE + 1 : self::MAX_DOMAIN_SIZE;
      $labelsToMeasure = $hasRootLabel ? $labelCount - 1 : $labelCount;

      // Start from the number of "." delimiters, then add the byte length of every label.
      $totalSize = $labelCount - 1;

      for ($index = 0; $index < $labelsToMeasure; ++$index) {
          $labelSize = \strlen($labels[$index]);
          $totalSize += $labelSize;

          if ($labelSize > self::MAX_LABEL_SIZE) {
              $info->errors |= self::ERROR_LABEL_TOO_LONG;
          }
      }

      if ($totalSize > $sizeLimit) {
          $info->errors |= self::ERROR_DOMAIN_NAME_TOO_LONG;
      }
  }
  /**
   * Applies the validity criteria to a single label and records every violation as an error
   * flag on $info.
   *
   * @see https://www.unicode.org/reports/tr46/#Validity_Criteria
   *
   * @param string              $label
   * @param array<string, bool> $options
   * @param bool                $canBeEmpty Whether an empty label is acceptable at this position
   */
  private static function validateLabel($label, Info $info, array $options, $canBeEmpty)
  {
      if ('' === $label) {
          $verifyDnsLength = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'];
          if ($verifyDnsLength && !$canBeEmpty) {
              $info->errors |= self::ERROR_EMPTY_LABEL;
          }

          return;
      }

      // Step 1. The label must be in Unicode Normalization Form C.
      if (!\Normalizer::isNormalized($label, \Normalizer::FORM_C)) {
          $info->errors |= self::ERROR_INVALID_ACE_LABEL;
      }

      $codePoints = self::utf8Decode($label);

      if (!$options['CheckHyphens']) {
          if ('xn--' === substr($label, 0, 4)) {
              $info->errors |= self::ERROR_PUNYCODE;
          }
      } else {
          // Step 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
          // in both the third and fourth positions.
          if (0x002D === ($codePoints[2] ?? null) && 0x002D === ($codePoints[3] ?? null)) {
              $info->errors |= self::ERROR_HYPHEN_3_4;
          }

          // Step 3. If CheckHyphens, the label must neither begin nor end with a U+002D
          // HYPHEN-MINUS character. The label is known to be non-empty at this point.
          if ('-' === $label[0]) {
              $info->errors |= self::ERROR_LEADING_HYPHEN;
          }

          if ('-' === substr($label, -1)) {
              $info->errors |= self::ERROR_TRAILING_HYPHEN;
          }
      }

      // Step 4. The label must not contain a U+002E (.) FULL STOP.
      if (\is_int(strpos($label, '.'))) {
          $info->errors |= self::ERROR_LABEL_HAS_DOT;
      }

      // Step 5. The label must not begin with a combining mark, that is: General_Category=Mark.
      if (1 === preg_match(Regex::COMBINING_MARK, $label)) {
          $info->errors |= self::ERROR_LEADING_COMBINING_MARK;
      }

      // Step 6. Each code point in the label must only have certain status values according to
      // Section 5, IDNA Mapping Table. The scan stops at the first offending code point.
      $deviationAllowed = !$options['Transitional_Processing'];
      $std3Rules = $options['UseSTD3ASCIIRules'];

      foreach ($codePoints as $cp) {
          $status = self::lookupCodePointStatus($cp, $std3Rules)['status'];
          $isAllowed = 'valid' === $status || ($deviationAllowed && 'deviation' === $status);

          if (!$isAllowed) {
              $info->errors |= self::ERROR_DISALLOWED;

              break;
          }
      }

      // Step 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in
      // The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)
      // [IDNA2008].
      if ($options['CheckJoiners'] && !self::isValidContextJ($codePoints, $label)) {
          $info->errors |= self::ERROR_CONTEXTJ;
      }

      // Step 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must
      // satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2.
      // The check is skipped once the domain is known to be an invalid Bidi domain.
      if ($options['CheckBidi'] && !($info->bidiDomain && !$info->validBidiDomain)) {
          self::validateBidiLabel($label, $info);
      }
  }
  /**
   * Decodes a Punycode string (without the "xn--" prefix) into a UTF-8 string.
   *
   * @see https://tools.ietf.org/html/rfc3492#section-6.2
   *
   * @param string $input
   *
   * @return string
   *
   * @throws \Exception When the input contains a byte that is not a valid digit, ends in the
   *                    middle of a number, overflows MAX_INT, or decodes to a value that is
   *                    not an encodable code point
   */
  private static function punycodeDecode($input)
  {
      $n = self::INITIAL_N;
      $out = 0;
      $i = 0;
      $bias = self::INITIAL_BIAS;
      $lastDelimIndex = strrpos($input, self::DELIMITER);
      // Number of basic code points: everything before the last delimiter, if there is one.
      $b = false === $lastDelimIndex ? 0 : $lastDelimIndex;
      $inputLength = \strlen($input);
      $output = [];
      $bytes = array_map('ord', str_split($input));

      // Copy the basic code points to the output unchanged.
      for ($j = 0; $j < $b; ++$j) {
          if ($bytes[$j] > 0x7F) {
              throw new \Exception('Invalid input');
          }

          $output[$out++] = $input[$j];
      }

      // Skip the delimiter itself when at least one basic code point preceded it.
      if ($b > 0) {
          ++$b;
      }

      for ($in = $b; $in < $inputLength; ++$out) {
          // Decode one generalized variable-length integer into $i.
          $oldi = $i;
          $w = 1;

          for ($k = self::BASE; /* no condition */; $k += self::BASE) {
              if ($in >= $inputLength) {
                  throw new \Exception('Invalid input');
              }

              $digit = self::$basicToDigit[$bytes[$in++] & 0xFF];

              if ($digit < 0) {
                  throw new \Exception('Invalid input');
              }

              if ($digit > intdiv(self::MAX_INT - $i, $w)) {
                  throw new \Exception('Integer overflow');
              }

              $i += $digit * $w;

              if ($k <= $bias) {
                  $t = self::TMIN;
              } elseif ($k >= $bias + self::TMAX) {
                  $t = self::TMAX;
              } else {
                  $t = $k - $bias;
              }

              // A digit below the threshold terminates the integer.
              if ($digit < $t) {
                  break;
              }

              $baseMinusT = self::BASE - $t;

              if ($w > intdiv(self::MAX_INT, $baseMinusT)) {
                  throw new \Exception('Integer overflow');
              }

              $w *= $baseMinusT;
          }

          $outPlusOne = $out + 1;
          $bias = self::adaptBias($i - $oldi, $outPlusOne, 0 === $oldi);

          if (intdiv($i, $outPlusOne) > self::MAX_INT - $n) {
              throw new \Exception('Integer overflow');
          }

          $n += intdiv($i, $outPlusOne);

          // mb_chr() returns false for values beyond U+10FFFF or inside the surrogate range,
          // and implode() would then silently drop the character. Fail explicitly instead so
          // the caller reports a Punycode error.
          if ($n > 0x10FFFF || ($n >= 0xD800 && $n <= 0xDFFF)) {
              throw new \Exception('Invalid input');
          }

          // Insert the decoded code point at position $i of the output.
          $i %= $outPlusOne;
          array_splice($output, $i++, 0, [mb_chr($n, 'utf-8')]);
      }

      return implode('', $output);
  }
  /**
   * Encodes a UTF-8 string as Punycode (without the "xn--" prefix).
   *
   * @see https://tools.ietf.org/html/rfc3492#section-6.3
   *
   * @param string $input
   *
   * @return string
   *
   * @throws \Exception When an intermediate value would overflow MAX_INT
   */
  private static function punycodeEncode($input)
  {
      $codePoints = self::utf8Decode($input);
      $total = \count($codePoints);
      $n = self::INITIAL_N;
      $bias = self::INITIAL_BIAS;
      $delta = 0;
      $encoded = '';

      // The basic (ASCII) code points are copied first, in their original order.
      foreach ($codePoints as $cp) {
          if ($cp < 0x80) {
              $encoded .= \chr($cp);
          }
      }

      $basicCount = \strlen($encoded);
      $handled = $basicCount;

      if ($basicCount > 0) {
          $encoded .= self::DELIMITER;
      }

      while ($handled < $total) {
          // Find the smallest code point that is still waiting to be encoded.
          $next = self::MAX_INT;
          foreach ($codePoints as $cp) {
              if ($cp >= $n && $cp < $next) {
                  $next = $cp;
              }
          }

          $gap = $next - $n;
          if ($gap > intdiv(self::MAX_INT - $delta, $handled + 1)) {
              throw new \Exception('Integer overflow');
          }

          $delta += $gap * ($handled + 1);
          $n = $next;

          foreach ($codePoints as $cp) {
              if ($cp < $n) {
                  if (0 === ++$delta) {
                      throw new \Exception('Integer overflow');
                  }

                  continue;
              }

              if ($cp !== $n) {
                  continue;
              }

              // Emit $delta as a generalized variable-length integer.
              $q = $delta;
              for ($k = self::BASE; /* no condition */; $k += self::BASE) {
                  // Threshold: $k - $bias clamped to the range TMIN..TMAX.
                  $t = max(self::TMIN, min(self::TMAX, $k - $bias));
                  if ($q < $t) {
                      break;
                  }

                  $range = self::BASE - $t;
                  $encoded .= self::encodeDigit($t + ($q - $t) % $range, false);
                  $q = intdiv($q - $t, $range);
              }

              $encoded .= self::encodeDigit($q, false);
              $bias = self::adaptBias($delta, $handled + 1, $handled === $basicCount);
              $delta = 0;
              ++$handled;
          }

          ++$delta;
          ++$n;
      }

      return $encoded;
  }
  /**
   * Computes the next bias value from the delta that was just encoded or decoded.
   *
   * @see https://tools.ietf.org/html/rfc3492#section-6.1
   *
   * @param int  $delta
   * @param int  $numPoints
   * @param bool $firstTime Selects division by DAMP instead of halving
   *
   * @return int
   */
  private static function adaptBias($delta, $numPoints, $firstTime)
  {
      if ($firstTime) {
          $delta = intdiv($delta, self::DAMP);
      } else {
          // Shifting right by one is a faster way of doing intdiv($delta, 2).
          $delta >>= 1;
      }

      $delta += intdiv($delta, $numPoints);

      $span = self::BASE - self::TMIN;
      $limit = ($span * self::TMAX) >> 1;

      for ($k = 0; $delta > $limit; $k += self::BASE) {
          $delta = intdiv($delta, $span);
      }

      return $k + intdiv(($span + 1) * $delta, $delta + self::SKEW);
  }
  /**
   * Returns the character for a digit value: 0-25 map to letters and 26-35 map to "0"-"9".
   *
   * @param int  $d
   * @param bool $flag When true the letter is returned in uppercase
   *
   * @return string
   */
  private static function encodeDigit($d, $flag)
  {
      // 97 is ord('a'); 26 + 22 is ord('0').
      $code = $d < 26 ? $d + 97 : $d + 22;

      if ($flag) {
          // Subtracting 32 (1 << 5) switches a lowercase ASCII letter to uppercase.
          $code -= 32;
      }

      return \chr($code);
  }
  /**
   * Converts a UTF-8 encoded string into a list of integer code points. Every invalid byte
   * sequence is replaced by a U+FFFD replacement code point.
   *
   * @see https://encoding.spec.whatwg.org/#utf-8-decoder
   *
   * @param string $input
   *
   * @return array<int, int>
   */
  private static function utf8Decode($input)
  {
      $decoded = [];
      $value = 0;
      $expected = 0; // continuation bytes required by the current lead byte
      $received = 0; // continuation bytes consumed so far
      $minByte = 0x80;
      $maxByte = 0xBF;
      $size = \strlen($input);

      for ($pos = 0; $pos < $size; ++$pos) {
          $octet = \ord($input[$pos]);

          if (0 !== $expected) {
              if ($octet >= $minByte && $octet <= $maxByte) {
                  // Only the first continuation byte has a narrowed range.
                  $minByte = 0x80;
                  $maxByte = 0xBF;
                  $value = ($value << 6) | ($octet & 0x3F);

                  if (++$received === $expected) {
                      $decoded[] = $value;
                      $value = 0;
                      $expected = 0;
                      $received = 0;
                  }
              } else {
                  // Abandon the sequence and step back so this byte is read again as a lead byte.
                  $value = 0;
                  $expected = 0;
                  $received = 0;
                  $minByte = 0x80;
                  $maxByte = 0xBF;
                  --$pos;
                  $decoded[] = 0xFFFD;
              }

              continue;
          }

          if ($octet <= 0x7F) {
              $decoded[] = $octet;
          } elseif ($octet >= 0xC2 && $octet <= 0xDF) {
              $expected = 1;
              $value = $octet & 0x1F;
          } elseif ($octet >= 0xE0 && $octet <= 0xEF) {
              if (0xE0 === $octet) {
                  $minByte = 0xA0;
              } elseif (0xED === $octet) {
                  $maxByte = 0x9F;
              }

              $expected = 2;
              $value = $octet & 0xF;
          } elseif ($octet >= 0xF0 && $octet <= 0xF4) {
              if (0xF0 === $octet) {
                  $minByte = 0x90;
              } elseif (0xF4 === $octet) {
                  $maxByte = 0x8F;
              }

              $expected = 3;
              $value = $octet & 0x7;
          } else {
              $decoded[] = 0xFFFD;
          }
      }

      // The string ended in the middle of a sequence.
      if (0 !== $expected) {
          $decoded[] = 0xFFFD;
      }

      return $decoded;
  }
  /**
   * Looks up the IDNA mapping table status of a code point, loading the tables from
   * Resources/unidata/ on the first call.
   *
   * @param int  $codePoint
   * @param bool $useSTD3ASCIIRules When true, the "disallowed_STD3_*" entries are reported as
   *                                'disallowed' instead of 'mapped' or 'valid'
   *
   * @return array{status: string, mapping?: string}
   */
  private static function lookupCodePointStatus($codePoint, $useSTD3ASCIIRules)
  {
      if (!self::$mappingTableLoaded) {
          self::$mappingTableLoaded = true;
          self::$mapped = require __DIR__.'/Resources/unidata/mapped.php';
          self::$ignored = require __DIR__.'/Resources/unidata/ignored.php';
          self::$deviation = require __DIR__.'/Resources/unidata/deviation.php';
          self::$disallowed = require __DIR__.'/Resources/unidata/disallowed.php';
          self::$disallowed_STD3_mapped = require __DIR__.'/Resources/unidata/disallowed_STD3_mapped.php';
          self::$disallowed_STD3_valid = require __DIR__.'/Resources/unidata/disallowed_STD3_valid.php';
      }

      // The tables are consulted in a fixed order; the first one containing the code point wins.
      if (isset(self::$mapped[$codePoint])) {
          return ['status' => 'mapped', 'mapping' => self::$mapped[$codePoint]];
      }

      if (isset(self::$ignored[$codePoint])) {
          return ['status' => 'ignored'];
      }

      if (isset(self::$deviation[$codePoint])) {
          return ['status' => 'deviation', 'mapping' => self::$deviation[$codePoint]];
      }

      if (isset(self::$disallowed[$codePoint]) || DisallowedRanges::inRange($codePoint)) {
          return ['status' => 'disallowed'];
      }

      if (isset(self::$disallowed_STD3_mapped[$codePoint])) {
          return [
              'status' => $useSTD3ASCIIRules ? 'disallowed' : 'mapped',
              'mapping' => self::$disallowed_STD3_mapped[$codePoint],
          ];
      }

      if (isset(self::$disallowed_STD3_valid[$codePoint])) {
          return ['status' => $useSTD3ASCIIRules ? 'disallowed' : 'valid'];
      }

      // Anything not listed in a table is valid.
      return ['status' => 'valid'];
  }
  767. }