diff --git a/.travis.yml b/.travis.yml index d571cfae..38b15594 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,5 @@ php: script: phpunit before_script: - - composer self-update - - composer update + - composer install - ./bin/pdp-psl diff --git a/data/public-suffix-list.php b/data/public-suffix-list.php index 2fb41cf3..f80403af 100644 --- a/data/public-suffix-list.php +++ b/data/public-suffix-list.php @@ -20287,4 +20287,4 @@ 'cern' => array ( ), -); \ No newline at end of file +); diff --git a/library/Pdp/Parser.php b/library/Pdp/Parser.php index 802e4b4d..46bc081b 100644 --- a/library/Pdp/Parser.php +++ b/library/Pdp/Parser.php @@ -57,15 +57,11 @@ public function parseUrl($url) 'fragment' => null, ); - if (preg_match(self::SCHEME_PATTERN, $url, $schemeMatches) === 0) { + if (preg_match(self::SCHEME_PATTERN, $url) === 0) { $url = 'http://' . preg_replace('#^//#', '', $url, 1); } - $parts = parse_url($url); - - if ($parts === false) { - throw new \InvalidArgumentException(sprintf('Invalid url %s', $url)); - } + $parts = $this->mbParseUrl($url); $elem = (array) $parts + $elem; @@ -91,6 +87,8 @@ public function parseUrl($url) */ public function parseHost($host) { + $host = mb_strtolower($host, 'UTF-8'); + $subdomain = null; $registerableDomain = null; $publicSuffix = null; @@ -130,7 +128,6 @@ public function getPublicSuffix($host) return null; } - $host = strtolower($host); $parts = array_reverse(explode('.', $host)); $publicSuffix = array(); $publicSuffixList = $this->publicSuffixList; @@ -185,7 +182,13 @@ public function getRegisterableDomain($host) return null; } - $host = strtolower($host); + $punycoded = (strpos($host, 'xn--') !== false); + + if ($punycoded) { + $host = idn_to_utf8($host); + } + + $host = mb_strtolower($host, 'UTF-8'); $publicSuffix = $this->getPublicSuffix($host); if ($publicSuffix === null || $host == $publicSuffix) { @@ -196,7 +199,13 @@ public function getRegisterableDomain($host) $hostParts = array_reverse(explode('.', $host)); $registerableDomainParts = array_slice($hostParts, 0, count($publicSuffixParts) + 1); - return implode('.', array_reverse($registerableDomainParts)); + $registerableDomain = implode('.', array_reverse($registerableDomainParts)); + + if ($punycoded) { + $registerableDomain = idn_to_ascii($registerableDomain); + } + + return $registerableDomain; } /** @@ -207,7 +216,6 @@ public function getRegisterableDomain($host) */ public function getSubdomain($host) { - $host = strtolower($host); $registerableDomain = $this->getRegisterableDomain($host); if ($registerableDomain === null || $host == $registerableDomain) { @@ -221,4 +229,43 @@ public function getSubdomain($host) return implode('.', array_reverse($subdomainParts)); } + /** + * UTF-8 aware parse_url() replacement. + * + * Taken from php.net manual comments {@link http://php.net/manual/en/function.parse-url.php#114817} + * + * @param string $url The URL to parse + * @param integer $component Specify one of PHP_URL_SCHEME, PHP_URL_HOST, + * PHP_URL_PORT, PHP_URL_USER, PHP_URL_PASS, PHP_URL_PATH, PHP_URL_QUERY or + * PHP_URL_FRAGMENT to retrieve just a specific URL component as a string + * (except when PHP_URL_PORT is given, in which case the return value will + * be an integer). + * @return mixed See parse_url documentation {@link http://us1.php.net/parse_url} + */ + public function mbParseUrl($url, $component = -1) + { + $enc_url = preg_replace_callback( + '%[^:/@?&=#]+%usD', + function ($matches) { + return urlencode($matches[0]); + }, + $url + ); + + $parts = parse_url($enc_url, $component); + + if ($parts === false) { + throw new \InvalidArgumentException(sprintf('Invalid url %s', $url)); + } + + if (is_array($parts)) { + foreach ($parts as $name => $value) { + $parts[$name] = urldecode($value); + } + } else { + $parts = urldecode($parts); + } + + return $parts; + } } diff --git a/library/Pdp/Uri/Url.php b/library/Pdp/Uri/Url.php index 21162cf1..efd575e1 100644 --- a/library/Pdp/Uri/Url.php +++ b/library/Pdp/Uri/Url.php @@ -134,7 +134,7 @@ public function __toString() $host = $this->host->__toString(); if ($host) { - $url .= urlencode($host); + $url .= $host; } if ($this->port) { diff --git a/tests/library/Pdp/CheckPublicSuffixTest.php b/tests/library/Pdp/CheckPublicSuffixTest.php index 74100a0d..43ddd863 100644 --- a/tests/library/Pdp/CheckPublicSuffixTest.php +++ b/tests/library/Pdp/CheckPublicSuffixTest.php @@ -3,9 +3,9 @@ namespace Pdp; /** - * This test case is based on the test data linked at + * This test case is based on the test data linked at * http://publicsuffix.org/list/ and provided by Rob Strading of Comodo. - * @link + * @link * http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1 */ class CheckPublicSuffixTest extends \PHPUnit_Framework_TestCase @@ -27,6 +27,8 @@ public function testPublicSuffixSpec() { // Test data from Rob Stradling at Comodo // http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1 + // Any copyright is dedicated to the Public Domain. + // http://creativecommons.org/publicdomain/zero/1.0/ // null input. $this->checkPublicSuffix(null, null); @@ -99,19 +101,39 @@ public function testPublicSuffixSpec() $this->checkPublicSuffix('k12.ak.us', null); $this->checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us'); $this->checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us'); + // IDN labels. + $this->checkPublicSuffix('食狮.com.cn', '食狮.com.cn'); + $this->checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn'); + $this->checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn'); + $this->checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn'); + $this->checkPublicSuffix('公司.cn', null); + $this->checkPublicSuffix('食狮.中国', '食狮.中国'); + $this->checkPublicSuffix('www.食狮.中国', '食狮.中国'); + $this->checkPublicSuffix('shishi.中国', 'shishi.中国'); + $this->checkPublicSuffix('中国', null); + // Same as above, but punycoded. + $this->checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn'); + $this->checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); + $this->checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); + $this->checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn'); + $this->checkPublicSuffix('xn--55qx5d.cn', null); + $this->checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); + $this->checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); + $this->checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s'); + $this->checkPublicSuffix('xn--fiqs8s', null); } /** - * This is my version of the checkPublicSuffix function referred to in the + * This is my version of the checkPublicSuffix function referred to in the * test instructions at the Public Suffix List project. * - * "You will need to define a checkPublicSuffix() function which takes as a - * parameter a domain name and the public suffix, runs your implementation + * "You will need to define a checkPublicSuffix() function which takes as a + * parameter a domain name and the public suffix, runs your implementation * on the domain name and checks the result is the public suffix expected." * * @link http://publicsuffix.org/list/ * - * @param string $input Domain and public suffix + * @param string $input Domain and public suffix * @param string $expected Expected result */ public function checkPublicSuffix($input, $expected) diff --git a/tests/library/Pdp/ParserTest.php b/tests/library/Pdp/ParserTest.php index 6c556117..4d7ff26c 100644 --- a/tests/library/Pdp/ParserTest.php +++ b/tests/library/Pdp/ParserTest.php @@ -23,11 +23,12 @@ protected function tearDown() /** * @covers Pdp\Parser::parseUrl() + * @covers Pdp\Parser::mbParseUrl() */ public function testParseBadUrlThrowsInvalidArgumentException() { $this->setExpectedException( - '\InvalidArgumentException', + '\InvalidArgumentException', 'Invalid url http:///example.com' ); @@ -94,21 +95,24 @@ public function testGetSubdomain($url, $publicSuffix, $registerableDomain, $subd $this->assertSame($subdomain, $pdpUrl->host->subdomain); $this->assertSame($subdomain, $this->parser->getSubdomain($hostPart)); } - - /** + + /** * @dataProvider parseDataProvider - */ - public function testPHPparse_urlCanReturnCorrectHost($url, $publicSuffix, $registerableDomain, $subdomain, $hostPart) - { - $this->assertEquals($hostPart, parse_url('http://' . $hostPart, PHP_URL_HOST)); - } + */ + public function testMbParseUrlCanReturnCorrectHost($url, $publicSuffix, $registerableDomain, $subdomain, $hostPart) + { + $this->assertEquals( + $hostPart, + $this->parser->mbParseUrl('http://' . $hostPart, PHP_URL_HOST) + ); + } public function parseDataProvider() { // url, public suffix, registerable domain, subdomain, host part return array( array('http://www.waxaudio.com.au/audio/albums/the_mashening', 'com.au', 'waxaudio.com.au', 'www', 'www.waxaudio.com.au'), - array('example.com', 'com', 'example.com', null, 'example.com'), + array('example.COM', 'com', 'example.com', null, 'example.com'), array('giant.yyyy', 'yyyy', 'giant.yyyy', null, 'giant.yyyy'), array('cea-law.co.il', 'co.il', 'cea-law.co.il', null, 'cea-law.co.il'), array('http://edition.cnn.com/WORLD/', 'com', 'cnn.com', 'edition', 'edition.cnn.com'), @@ -139,6 +143,15 @@ public function parseDataProvider() array('test.museum', 'museum', 'test.museum', null, 'test.museum'), array('bob.smith.name', 'name', 'smith.name', 'bob', 'bob.smith.name'), array('tons.of.info', 'info', 'of.info', 'tons', 'tons.of.info'), + // Test IDN parsing + // BEGIN https://github.com/jeremykendall/php-domain-parser/issues/29 + array('http://Яндекс.РФ', 'рф', 'яндекс.рф', null, 'яндекс.рф'), + // END https://github.com/jeremykendall/php-domain-parser/issues/29 + array('www.食狮.中国', '中国', '食狮.中国', 'www', 'www.食狮.中国'), + array('食狮.com.cn', 'com.cn', '食狮.com.cn', null, '食狮.com.cn'), + // Test punycode URLs + array('www.xn--85x722f.xn--fiqs8s', 'xn--fiqs8s', 'xn--85x722f.xn--fiqs8s', 'www', 'www.xn--85x722f.xn--fiqs8s'), + array('xn--85x722f.com.cn', 'com.cn', 'xn--85x722f.com.cn', null, 'xn--85x722f.com.cn'), ); } } diff --git a/tests/library/Pdp/Uri/UrlTest.php b/tests/library/Pdp/Uri/UrlTest.php index c59b38ae..6be76e18 100644 --- a/tests/library/Pdp/Uri/UrlTest.php +++ b/tests/library/Pdp/Uri/UrlTest.php @@ -126,4 +126,18 @@ public function testFtpUrlToString() $url = $this->parser->parseUrl($ftpUrl); $this->assertEquals($ftpUrl, $url->__toString()); } + + /** + * @group issue29 + * @see https://github.com/jeremykendall/php-domain-parser/issues/29 + */ + public function testIdnToAscii() + { + $idn = 'Яндекс.РФ'; + $expected = 'http://яндекс.рф'; + $url = $this->parser->parseUrl($idn); + $actual = $url->__toString(); + + $this->assertEquals($expected, $actual); + } }