From 17ee56c800e67ece0af767369236c88b9286d8a9 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 16:46:49 -0500 Subject: [PATCH 01/11] Adds IDNA support --- library/Pdp/Parser.php | 25 ++++++++++++++++++++----- tests/library/Pdp/Uri/UrlTest.php | 14 ++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/library/Pdp/Parser.php b/library/Pdp/Parser.php index 802e4b4d..03d69e13 100644 --- a/library/Pdp/Parser.php +++ b/library/Pdp/Parser.php @@ -20,7 +20,7 @@ */ class Parser { - const SCHEME_PATTERN = '#^(http|ftp)s?://#i'; + const SCHEME_PATTERN = '#^((http|ftp)s?://)#i'; /** * @var PublicSuffixList Public Suffix List @@ -57,10 +57,12 @@ public function parseUrl($url) 'fragment' => null, ); - if (preg_match(self::SCHEME_PATTERN, $url, $schemeMatches) === 0) { + if (preg_match(self::SCHEME_PATTERN, $url) === 0) { $url = 'http://' . preg_replace('#^//#', '', $url, 1); } + $url = $this->idnToAscii($url); + $parts = parse_url($url); if ($parts === false) { @@ -130,7 +132,7 @@ public function getPublicSuffix($host) return null; } - $host = strtolower($host); + $host = mb_strtolower($host); $parts = array_reverse(explode('.', $host)); $publicSuffix = array(); $publicSuffixList = $this->publicSuffixList; @@ -185,7 +187,7 @@ public function getRegisterableDomain($host) return null; } - $host = strtolower($host); + $host = mb_strtolower($host); $publicSuffix = $this->getPublicSuffix($host); if ($publicSuffix === null || $host == $publicSuffix) { @@ -207,7 +209,7 @@ public function getRegisterableDomain($host) */ public function getSubdomain($host) { - $host = strtolower($host); + $host = mb_strtolower($host); $registerableDomain = $this->getRegisterableDomain($host); if ($registerableDomain === null || $host == $registerableDomain) { @@ -221,4 +223,17 @@ public function getSubdomain($host) return implode('.', array_reverse($subdomainParts)); } + /** + * Convert IDNA URLs to ASCII - must strip the scheme and only convert the URL + * + * @param string $url URL to convert + * @return string ASCII URL + */ + protected function idnToAscii($url) + { + $split = preg_split(self::SCHEME_PATTERN, $url, -1, PREG_SPLIT_DELIM_CAPTURE); + $url = sprintf('%s%s', $split[1], idn_to_ascii($split[3])); + + return $url; + } } diff --git a/tests/library/Pdp/Uri/UrlTest.php b/tests/library/Pdp/Uri/UrlTest.php index c59b38ae..2f2d5cc1 100644 --- a/tests/library/Pdp/Uri/UrlTest.php +++ b/tests/library/Pdp/Uri/UrlTest.php @@ -126,4 +126,18 @@ public function testFtpUrlToString() $url = $this->parser->parseUrl($ftpUrl); $this->assertEquals($ftpUrl, $url->__toString()); } + + /** + * @group issue29 + * @see https://github.com/jeremykendall/php-domain-parser/issues/29 + */ + public function testIdnToAscii() + { + $idn = 'Яндекс.РФ'; + $expected = 'http://xn--d1acpjx3f.xn--p1ai'; + $url = $this->parser->parseUrl($idn); + $actual = $url->__toString(); + + $this->assertEquals($expected, $actual); + } } From d8c4e72f24d24f2b217f0e5bb64555b4b7b6c861 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 22:33:23 -0500 Subject: [PATCH 02/11] Removes urlencode for host part --- library/Pdp/Uri/Url.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/Pdp/Uri/Url.php b/library/Pdp/Uri/Url.php index 21162cf1..efd575e1 100644 --- a/library/Pdp/Uri/Url.php +++ b/library/Pdp/Uri/Url.php @@ -134,7 +134,7 @@ public function __toString() $host = $this->host->__toString(); if ($host) { - $url .= urlencode($host); + $url .= $host; } if ($this->port) { From 46d3254d7dd8de74912dea7355cdf607edf85df7 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 22:33:44 -0500 Subject: [PATCH 03/11] Updates test cases based on PSL provided test cases --- tests/library/Pdp/CheckPublicSuffixTest.php | 34 +++++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/tests/library/Pdp/CheckPublicSuffixTest.php b/tests/library/Pdp/CheckPublicSuffixTest.php index 74100a0d..43ddd863 100644 --- a/tests/library/Pdp/CheckPublicSuffixTest.php +++ b/tests/library/Pdp/CheckPublicSuffixTest.php @@ -3,9 +3,9 @@ namespace Pdp; /** - * This test case is based on the test data linked at + * This test case is based on the test data linked at * http://publicsuffix.org/list/ and provided by Rob Strading of Comodo. - * @link + * @link * http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1 */ class CheckPublicSuffixTest extends \PHPUnit_Framework_TestCase @@ -27,6 +27,8 @@ public function testPublicSuffixSpec() { // Test data from Rob Stradling at Comodo // http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1 + // Any copyright is dedicated to the Public Domain. + // http://creativecommons.org/publicdomain/zero/1.0/ // null input. $this->checkPublicSuffix(null, null); @@ -99,19 +101,39 @@ public function testPublicSuffixSpec() $this->checkPublicSuffix('k12.ak.us', null); $this->checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us'); $this->checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us'); + // IDN labels. + $this->checkPublicSuffix('食狮.com.cn', '食狮.com.cn'); + $this->checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn'); + $this->checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn'); + $this->checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn'); + $this->checkPublicSuffix('公司.cn', null); + $this->checkPublicSuffix('食狮.中国', '食狮.中国'); + $this->checkPublicSuffix('www.食狮.中国', '食狮.中国'); + $this->checkPublicSuffix('shishi.中国', 'shishi.中国'); + $this->checkPublicSuffix('中国', null); + // Same as above, but punycoded. + $this->checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn'); + $this->checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); + $this->checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); + $this->checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn'); + $this->checkPublicSuffix('xn--55qx5d.cn', null); + $this->checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); + $this->checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); + $this->checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s'); + $this->checkPublicSuffix('xn--fiqs8s', null); } /** - * This is my version of the checkPublicSuffix function referred to in the + * This is my version of the checkPublicSuffix function referred to in the * test instructions at the Public Suffix List project. * - * "You will need to define a checkPublicSuffix() function which takes as a - * parameter a domain name and the public suffix, runs your implementation + * "You will need to define a checkPublicSuffix() function which takes as a + * parameter a domain name and the public suffix, runs your implementation * on the domain name and checks the result is the public suffix expected." * * @link http://publicsuffix.org/list/ * - * @param string $input Domain and public suffix + * @param string $input Domain and public suffix * @param string $expected Expected result */ public function checkPublicSuffix($input, $expected) From 1573a180d754a843cb22c2950b9689fc6faeacbf Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 22:34:11 -0500 Subject: [PATCH 04/11] Updates PHP PSL --- data/public-suffix-list.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/public-suffix-list.php b/data/public-suffix-list.php index 2fb41cf3..f80403af 100644 --- a/data/public-suffix-list.php +++ b/data/public-suffix-list.php @@ -20287,4 +20287,4 @@ 'cern' => array ( ), -); \ No newline at end of file +); From cf790535bf67c0ee2e23d40f757c2ab3cb21b141 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 22:35:14 -0500 Subject: [PATCH 05/11] Updates parser to handle IDN URLs --- library/Pdp/Parser.php | 60 +++++++++++++++++++++---------- tests/library/Pdp/Uri/UrlTest.php | 2 +- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/library/Pdp/Parser.php b/library/Pdp/Parser.php index 03d69e13..d3a0a38b 100644 --- a/library/Pdp/Parser.php +++ b/library/Pdp/Parser.php @@ -61,13 +61,7 @@ public function parseUrl($url) $url = 'http://' . preg_replace('#^//#', '', $url, 1); } - $url = $this->idnToAscii($url); - - $parts = parse_url($url); - - if ($parts === false) { - throw new \InvalidArgumentException(sprintf('Invalid url %s', $url)); - } + $parts = $this->mbParseUrl($url); $elem = (array) $parts + $elem; @@ -93,6 +87,8 @@ public function parseUrl($url) */ public function parseHost($host) { + $host = mb_strtolower($host, 'UTF-8'); + $subdomain = null; $registerableDomain = null; $publicSuffix = null; @@ -132,7 +128,7 @@ public function getPublicSuffix($host) return null; } - $host = mb_strtolower($host); + $host = mb_strtolower($host, 'UTF-8'); $parts = array_reverse(explode('.', $host)); $publicSuffix = array(); $publicSuffixList = $this->publicSuffixList; @@ -187,7 +183,13 @@ public function getRegisterableDomain($host) return null; } - $host = mb_strtolower($host); + $punycoded = (strpos($host, 'xn--') !== false); + + if ($punycoded) { + $host = idn_to_utf8($host); + } + + $host = mb_strtolower($host, 'UTF-8'); $publicSuffix = $this->getPublicSuffix($host); if ($publicSuffix === null || $host == $publicSuffix) { @@ -198,7 +200,13 @@ public function getRegisterableDomain($host) $hostParts = array_reverse(explode('.', $host)); $registerableDomainParts = array_slice($hostParts, 0, count($publicSuffixParts) + 1); - return implode('.', array_reverse($registerableDomainParts)); + $registerableDomain = implode('.', array_reverse($registerableDomainParts)); + + if ($punycoded) { + $registerableDomain = idn_to_ascii($registerableDomain); + } + + return $registerableDomain; } /** @@ -209,7 +217,7 @@ public function getRegisterableDomain($host) */ public function getSubdomain($host) { - $host = mb_strtolower($host); + $host = mb_strtolower($host, 'UTF-8'); $registerableDomain = $this->getRegisterableDomain($host); if ($registerableDomain === null || $host == $registerableDomain) { @@ -224,16 +232,32 @@ public function getSubdomain($host) } /** - * Convert IDNA URLs to ASCII - must strip the scheme and only convert the URL + * UTF-8 aware parse_url() replacement. Taken from php.net manual comments. + * + * @link http://php.net/manual/en/function.parse-url.php#114817 * - * @param string $url URL to convert - * @return string ASCII URL + * @return array */ - protected function idnToAscii($url) + public function mbParseUrl($url) { - $split = preg_split(self::SCHEME_PATTERN, $url, -1, PREG_SPLIT_DELIM_CAPTURE); - $url = sprintf('%s%s', $split[1], idn_to_ascii($split[3])); + $enc_url = preg_replace_callback( + '%[^:/@?&=#]+%usD', + function ($matches) { + return urlencode($matches[0]); + }, + $url + ); + + $parts = parse_url($enc_url); + + if ($parts === false) { + throw new \InvalidArgumentException(sprintf('Invalid url %s', $url)); + } + + foreach ($parts as $name => $value) { + $parts[$name] = urldecode($value); + } - return $url; + return $parts; } } diff --git a/tests/library/Pdp/Uri/UrlTest.php b/tests/library/Pdp/Uri/UrlTest.php index 2f2d5cc1..6be76e18 100644 --- a/tests/library/Pdp/Uri/UrlTest.php +++ b/tests/library/Pdp/Uri/UrlTest.php @@ -134,7 +134,7 @@ public function testFtpUrlToString() public function testIdnToAscii() { $idn = 'Яндекс.РФ'; - $expected = 'http://xn--d1acpjx3f.xn--p1ai'; + $expected = 'http://яндекс.рф'; $url = $this->parser->parseUrl($idn); $actual = $url->__toString(); From 773cc01493175871ac3600056239fb7463f0c21e Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 22:37:15 -0500 Subject: [PATCH 06/11] Removes extra parentheses --- library/Pdp/Parser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/Pdp/Parser.php b/library/Pdp/Parser.php index d3a0a38b..a0408b08 100644 --- a/library/Pdp/Parser.php +++ b/library/Pdp/Parser.php @@ -20,7 +20,7 @@ */ class Parser { - const SCHEME_PATTERN = '#^((http|ftp)s?://)#i'; + const SCHEME_PATTERN = '#^(http|ftp)s?://#i'; /** * @var PublicSuffixList Public Suffix List From 09486a882dbc95cd680ea966e72bbfbaba6bcc33 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sat, 19 Jul 2014 23:02:05 -0500 Subject: [PATCH 07/11] Removes unnecessary calls to mb_strtolower --- library/Pdp/Parser.php | 4 +--- tests/library/Pdp/ParserTest.php | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/library/Pdp/Parser.php b/library/Pdp/Parser.php index a0408b08..66b7e5d3 100644 --- a/library/Pdp/Parser.php +++ b/library/Pdp/Parser.php @@ -128,7 +128,6 @@ public function getPublicSuffix($host) return null; } - $host = mb_strtolower($host, 'UTF-8'); $parts = array_reverse(explode('.', $host)); $publicSuffix = array(); $publicSuffixList = $this->publicSuffixList; @@ -217,7 +216,6 @@ public function getRegisterableDomain($host) */ public function getSubdomain($host) { - $host = mb_strtolower($host, 'UTF-8'); $registerableDomain = $this->getRegisterableDomain($host); if ($registerableDomain === null || $host == $registerableDomain) { @@ -238,7 +236,7 @@ public function getSubdomain($host) * * @return array */ - public function mbParseUrl($url) + protected function mbParseUrl($url) { $enc_url = preg_replace_callback( '%[^:/@?&=#]+%usD', diff --git a/tests/library/Pdp/ParserTest.php b/tests/library/Pdp/ParserTest.php index 6c556117..e5fda2e0 100644 --- a/tests/library/Pdp/ParserTest.php +++ b/tests/library/Pdp/ParserTest.php @@ -23,6 +23,7 @@ protected function tearDown() /** * @covers Pdp\Parser::parseUrl() + * @covers Pdp\Parser::mbParseUrl() */ public function testParseBadUrlThrowsInvalidArgumentException() { @@ -108,7 +109,7 @@ public function parseDataProvider() // url, public suffix, registerable domain, subdomain, host part return array( array('http://www.waxaudio.com.au/audio/albums/the_mashening', 'com.au', 'waxaudio.com.au', 'www', 'www.waxaudio.com.au'), - array('example.com', 'com', 'example.com', null, 'example.com'), + array('example.COM', 'com', 'example.com', null, 'example.com'), array('giant.yyyy', 'yyyy', 'giant.yyyy', null, 'giant.yyyy'), array('cea-law.co.il', 'co.il', 'cea-law.co.il', null, 'cea-law.co.il'), array('http://edition.cnn.com/WORLD/', 'com', 'cnn.com', 'edition', 'edition.cnn.com'), From 428718fedf3faefd65f5d2aadd4d44694dc4f2e0 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sun, 20 Jul 2014 17:11:12 -0500 Subject: [PATCH 08/11] Refactors mbParseUrl to allow for use of $component param --- library/Pdp/Parser.php | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/library/Pdp/Parser.php b/library/Pdp/Parser.php index 66b7e5d3..46bc081b 100644 --- a/library/Pdp/Parser.php +++ b/library/Pdp/Parser.php @@ -230,13 +230,19 @@ public function getSubdomain($host) } /** - * UTF-8 aware parse_url() replacement. Taken from php.net manual comments. + * UTF-8 aware parse_url() replacement. * - * @link http://php.net/manual/en/function.parse-url.php#114817 + * Taken from php.net manual comments {@link http://php.net/manual/en/function.parse-url.php#114817} * - * @return array + * @param string $url The URL to parse + * @param integer $component Specify one of PHP_URL_SCHEME, PHP_URL_HOST, + * PHP_URL_PORT, PHP_URL_USER, PHP_URL_PASS, PHP_URL_PATH, PHP_URL_QUERY or + * PHP_URL_FRAGMENT to retrieve just a specific URL component as a string + * (except when PHP_URL_PORT is given, in which case the return value will + * be an integer). + * @return mixed See parse_url documentation {@link http://us1.php.net/parse_url} */ - protected function mbParseUrl($url) + public function mbParseUrl($url, $component = -1) { $enc_url = preg_replace_callback( '%[^:/@?&=#]+%usD', @@ -246,14 +252,18 @@ function ($matches) { $url ); - $parts = parse_url($enc_url); + $parts = parse_url($enc_url, $component); if ($parts === false) { throw new \InvalidArgumentException(sprintf('Invalid url %s', $url)); } - foreach ($parts as $name => $value) { - $parts[$name] = urldecode($value); + if (is_array($parts)) { + foreach ($parts as $name => $value) { + $parts[$name] = urldecode($value); + } + } else { + $parts = urldecode($parts); } return $parts; From a20756229d1a8934a24ae2901b977fb3b6d1572b Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sun, 20 Jul 2014 17:11:45 -0500 Subject: [PATCH 09/11] Refactors and adds tests --- tests/library/Pdp/ParserTest.php | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tests/library/Pdp/ParserTest.php b/tests/library/Pdp/ParserTest.php index e5fda2e0..f0430eaf 100644 --- a/tests/library/Pdp/ParserTest.php +++ b/tests/library/Pdp/ParserTest.php @@ -28,7 +28,7 @@ protected function tearDown() public function testParseBadUrlThrowsInvalidArgumentException() { $this->setExpectedException( - '\InvalidArgumentException', + '\InvalidArgumentException', 'Invalid url http:///example.com' ); @@ -95,14 +95,17 @@ public function testGetSubdomain($url, $publicSuffix, $registerableDomain, $subd $this->assertSame($subdomain, $pdpUrl->host->subdomain); $this->assertSame($subdomain, $this->parser->getSubdomain($hostPart)); } - - /** + + /** * @dataProvider parseDataProvider - */ - public function testPHPparse_urlCanReturnCorrectHost($url, $publicSuffix, $registerableDomain, $subdomain, $hostPart) - { - $this->assertEquals($hostPart, parse_url('http://' . $hostPart, PHP_URL_HOST)); - } + */ + public function testMbParseUrlCanReturnCorrectHost($url, $publicSuffix, $registerableDomain, $subdomain, $hostPart) + { + $this->assertEquals( + $hostPart, + $this->parser->mbParseUrl('http://' . $hostPart, PHP_URL_HOST) + ); + } public function parseDataProvider() { @@ -140,6 +143,14 @@ public function parseDataProvider() array('test.museum', 'museum', 'test.museum', null, 'test.museum'), array('bob.smith.name', 'name', 'smith.name', 'bob', 'bob.smith.name'), array('tons.of.info', 'info', 'of.info', 'tons', 'tons.of.info'), + // Test IDN parsing + // Related to https://github.com/jeremykendall/php-domain-parser/issues/29 + array('http://Яндекс.РФ', 'рф', 'яндекс.рф', null, 'яндекс.рф'), + array('www.食狮.中国', '中国', '食狮.中国', 'www', 'www.食狮.中国'), + array('食狮.com.cn', 'com.cn', '食狮.com.cn', null, '食狮.com.cn'), + // Test punycode URLs + array('www.xn--85x722f.xn--fiqs8s', 'xn--fiqs8s', 'xn--85x722f.xn--fiqs8s', 'www', 'www.xn--85x722f.xn--fiqs8s'), + array('xn--85x722f.com.cn', 'com.cn', 'xn--85x722f.com.cn', null, 'xn--85x722f.com.cn'), ); } } From 06a0aae296997cfe079c1b2fdf4fe5f52471d32b Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sun, 20 Jul 2014 20:12:36 -0500 Subject: [PATCH 10/11] Updates comments --- tests/library/Pdp/ParserTest.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/library/Pdp/ParserTest.php b/tests/library/Pdp/ParserTest.php index f0430eaf..4d7ff26c 100644 --- a/tests/library/Pdp/ParserTest.php +++ b/tests/library/Pdp/ParserTest.php @@ -144,8 +144,9 @@ public function parseDataProvider() array('bob.smith.name', 'name', 'smith.name', 'bob', 'bob.smith.name'), array('tons.of.info', 'info', 'of.info', 'tons', 'tons.of.info'), // Test IDN parsing - // Related to https://github.com/jeremykendall/php-domain-parser/issues/29 + // BEGIN https://github.com/jeremykendall/php-domain-parser/issues/29 array('http://Яндекс.РФ', 'рф', 'яндекс.рф', null, 'яндекс.рф'), + // END https://github.com/jeremykendall/php-domain-parser/issues/29 array('www.食狮.中国', '中国', '食狮.中国', 'www', 'www.食狮.中国'), array('食狮.com.cn', 'com.cn', '食狮.com.cn', null, '食狮.com.cn'), // Test punycode URLs From 551aa8a6279a0610eebc46cb5909a907af0bc5f1 Mon Sep 17 00:00:00 2001 From: jeremykendall Date: Sun, 20 Jul 2014 20:30:03 -0500 Subject: [PATCH 11/11] Removes composer self-update from .travis.yml, changes update to install composer self-update has been causing tons of errors in my builds, so I'm scrapping it. --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index d571cfae..38b15594 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,5 @@ php: script: phpunit before_script: - - composer self-update - - composer update + - composer install - ./bin/pdp-psl