Skip to content

Commit

Permalink
Merge pull request #31 from jeremykendall/feature/idn-support
Browse files Browse the repository at this point in the history
Adds IDNA support
  • Loading branch information
jeremykendall committed Jul 21, 2014
2 parents a84c708 + 551aa8a commit bafb754
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 29 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,5 @@ php:
script: phpunit

before_script:
- composer self-update
- composer update
- composer install
- ./bin/pdp-psl
2 changes: 1 addition & 1 deletion data/public-suffix-list.php
Original file line number Diff line number Diff line change
Expand Up @@ -20287,4 +20287,4 @@
'cern' =>
array (
),
);
);
67 changes: 57 additions & 10 deletions library/Pdp/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,11 @@ public function parseUrl($url)
'fragment' => null,
);

if (preg_match(self::SCHEME_PATTERN, $url, $schemeMatches) === 0) {
if (preg_match(self::SCHEME_PATTERN, $url) === 0) {
$url = 'http://' . preg_replace('#^//#', '', $url, 1);
}

$parts = parse_url($url);

if ($parts === false) {
throw new \InvalidArgumentException(sprintf('Invalid url %s', $url));
}
$parts = $this->mbParseUrl($url);

$elem = (array) $parts + $elem;

Expand All @@ -91,6 +87,8 @@ public function parseUrl($url)
*/
public function parseHost($host)
{
$host = mb_strtolower($host, 'UTF-8');

$subdomain = null;
$registerableDomain = null;
$publicSuffix = null;
Expand Down Expand Up @@ -130,7 +128,6 @@ public function getPublicSuffix($host)
return null;
}

$host = strtolower($host);
$parts = array_reverse(explode('.', $host));
$publicSuffix = array();
$publicSuffixList = $this->publicSuffixList;
Expand Down Expand Up @@ -185,7 +182,13 @@ public function getRegisterableDomain($host)
return null;
}

$host = strtolower($host);
$punycoded = (strpos($host, 'xn--') !== false);

if ($punycoded) {
$host = idn_to_utf8($host);
}

$host = mb_strtolower($host, 'UTF-8');
$publicSuffix = $this->getPublicSuffix($host);

if ($publicSuffix === null || $host == $publicSuffix) {
Expand All @@ -196,7 +199,13 @@ public function getRegisterableDomain($host)
$hostParts = array_reverse(explode('.', $host));
$registerableDomainParts = array_slice($hostParts, 0, count($publicSuffixParts) + 1);

return implode('.', array_reverse($registerableDomainParts));
$registerableDomain = implode('.', array_reverse($registerableDomainParts));

if ($punycoded) {
$registerableDomain = idn_to_ascii($registerableDomain);
}

return $registerableDomain;
}

/**
Expand All @@ -207,7 +216,6 @@ public function getRegisterableDomain($host)
*/
public function getSubdomain($host)
{
$host = strtolower($host);
$registerableDomain = $this->getRegisterableDomain($host);

if ($registerableDomain === null || $host == $registerableDomain) {
Expand All @@ -221,4 +229,43 @@ public function getSubdomain($host)
return implode('.', array_reverse($subdomainParts));
}

/**
 * UTF-8 aware parse_url() replacement.
 *
 * Taken from php.net manual comments {@link http://php.net/manual/en/function.parse-url.php#114817}
 *
 * Every run of characters that is not one of the delimiters `:/@?&=#` is
 * URL-encoded before the URL is handed to parse_url(), so multibyte
 * characters reach it as plain ASCII percent-escapes. String components in
 * the result are URL-decoded again; non-string results (the integer port, or
 * null for a component that is not present) are returned untouched.
 *
 * @param string $url The URL to parse
 * @param integer $component Specify one of PHP_URL_SCHEME, PHP_URL_HOST,
 * PHP_URL_PORT, PHP_URL_USER, PHP_URL_PASS, PHP_URL_PATH, PHP_URL_QUERY or
 * PHP_URL_FRAGMENT to retrieve just a specific URL component as a string
 * (except when PHP_URL_PORT is given, in which case the return value will
 * be an integer).
 * @return mixed See parse_url documentation {@link http://us1.php.net/parse_url}
 * @throws \InvalidArgumentException If the URL cannot be encoded (e.g. it is
 * not valid UTF-8) or parse_url() reports it as malformed
 */
public function mbParseUrl($url, $component = -1)
{
    $encodedUrl = preg_replace_callback(
        '%[^:/@?&=#]+%usD',
        function ($matches) {
            return urlencode($matches[0]);
        },
        $url
    );

    // preg_replace_callback() returns null on failure (for instance when the
    // subject is not valid UTF-8 and the /u modifier is in effect). Passing
    // that null on to parse_url() would silently produce an empty result.
    if ($encodedUrl === null) {
        throw new \InvalidArgumentException(sprintf('Invalid url %s', $url));
    }

    $parts = parse_url($encodedUrl, $component);

    if ($parts === false) {
        throw new \InvalidArgumentException(sprintf('Invalid url %s', $url));
    }

    if (is_array($parts)) {
        foreach ($parts as $name => $value) {
            // Only strings were encoded above; the port is an integer and
            // must keep its type.
            if (is_string($value)) {
                $parts[$name] = urldecode($value);
            }
        }

        return $parts;
    }

    // Single-component mode: keep an integer port and a null "not present"
    // result exactly as parse_url() returned them.
    return is_string($parts) ? urldecode($parts) : $parts;
}
}
2 changes: 1 addition & 1 deletion library/Pdp/Uri/Url.php
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public function __toString()
$host = $this->host->__toString();

if ($host) {
$url .= urlencode($host);
$url .= $host;
}

if ($this->port) {
Expand Down
34 changes: 28 additions & 6 deletions tests/library/Pdp/CheckPublicSuffixTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
namespace Pdp;

/**
* This test case is based on the test data linked at
* This test case is based on the test data linked at
* http://publicsuffix.org/list/ and provided by Rob Stradling of Comodo.
* @link
* @link
* http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1
*/
class CheckPublicSuffixTest extends \PHPUnit_Framework_TestCase
Expand All @@ -27,6 +27,8 @@ public function testPublicSuffixSpec()
{
// Test data from Rob Stradling at Comodo
// http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1
// Any copyright is dedicated to the Public Domain.
// http://creativecommons.org/publicdomain/zero/1.0/

// null input.
$this->checkPublicSuffix(null, null);
Expand Down Expand Up @@ -99,19 +101,39 @@ public function testPublicSuffixSpec()
$this->checkPublicSuffix('k12.ak.us', null);
$this->checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us');
$this->checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us');
// IDN labels.
$this->checkPublicSuffix('食狮.com.cn', '食狮.com.cn');
$this->checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn');
$this->checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn');
$this->checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn');
$this->checkPublicSuffix('公司.cn', null);
$this->checkPublicSuffix('食狮.中国', '食狮.中国');
$this->checkPublicSuffix('www.食狮.中国', '食狮.中国');
$this->checkPublicSuffix('shishi.中国', 'shishi.中国');
$this->checkPublicSuffix('中国', null);
// Same as above, but punycoded.
$this->checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn');
$this->checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn');
$this->checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn');
$this->checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn');
$this->checkPublicSuffix('xn--55qx5d.cn', null);
$this->checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s');
$this->checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s');
$this->checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s');
$this->checkPublicSuffix('xn--fiqs8s', null);
}

/**
* This is my version of the checkPublicSuffix function referred to in the
* This is my version of the checkPublicSuffix function referred to in the
* test instructions at the Public Suffix List project.
*
* "You will need to define a checkPublicSuffix() function which takes as a
* parameter a domain name and the public suffix, runs your implementation
* "You will need to define a checkPublicSuffix() function which takes as a
* parameter a domain name and the public suffix, runs your implementation
* on the domain name and checks the result is the public suffix expected."
*
* @link http://publicsuffix.org/list/
*
* @param string $input Domain and public suffix
* @param string $input Domain and public suffix
* @param string $expected Expected result
*/
public function checkPublicSuffix($input, $expected)
Expand Down
31 changes: 22 additions & 9 deletions tests/library/Pdp/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ protected function tearDown()

/**
* @covers Pdp\Parser::parseUrl()
* @covers Pdp\Parser::mbParseUrl()
*/
public function testParseBadUrlThrowsInvalidArgumentException()
{
$this->setExpectedException(
'\InvalidArgumentException',
'\InvalidArgumentException',
'Invalid url http:///example.com'
);

Expand Down Expand Up @@ -94,21 +95,24 @@ public function testGetSubdomain($url, $publicSuffix, $registerableDomain, $subd
$this->assertSame($subdomain, $pdpUrl->host->subdomain);
$this->assertSame($subdomain, $this->parser->getSubdomain($hostPart));
}
/**

/**
* @dataProvider parseDataProvider
*/
public function testPHPparse_urlCanReturnCorrectHost($url, $publicSuffix, $registerableDomain, $subdomain, $hostPart)
{
$this->assertEquals($hostPart, parse_url('http://' . $hostPart, PHP_URL_HOST));
}
*/
public function testMbParseUrlCanReturnCorrectHost($url, $publicSuffix, $registerableDomain, $subdomain, $hostPart)
{
    // Only the host column of the data set is exercised here: prefix it with
    // a scheme and check that the host component comes back unchanged.
    $parsedHost = $this->parser->mbParseUrl('http://' . $hostPart, PHP_URL_HOST);

    $this->assertEquals($hostPart, $parsedHost);
}

public function parseDataProvider()
{
// url, public suffix, registerable domain, subdomain, host part
return array(
array('http://www.waxaudio.com.au/audio/albums/the_mashening', 'com.au', 'waxaudio.com.au', 'www', 'www.waxaudio.com.au'),
array('example.com', 'com', 'example.com', null, 'example.com'),
array('example.COM', 'com', 'example.com', null, 'example.com'),
array('giant.yyyy', 'yyyy', 'giant.yyyy', null, 'giant.yyyy'),
array('cea-law.co.il', 'co.il', 'cea-law.co.il', null, 'cea-law.co.il'),
array('http://edition.cnn.com/WORLD/', 'com', 'cnn.com', 'edition', 'edition.cnn.com'),
Expand Down Expand Up @@ -139,6 +143,15 @@ public function parseDataProvider()
array('test.museum', 'museum', 'test.museum', null, 'test.museum'),
array('bob.smith.name', 'name', 'smith.name', 'bob', 'bob.smith.name'),
array('tons.of.info', 'info', 'of.info', 'tons', 'tons.of.info'),
// Test IDN parsing
// BEGIN https://github.com/jeremykendall/php-domain-parser/issues/29
array('http://Яндекс.РФ', 'рф', 'яндекс.рф', null, 'яндекс.рф'),
// END https://github.com/jeremykendall/php-domain-parser/issues/29
array('www.食狮.中国', '中国', '食狮.中国', 'www', 'www.食狮.中国'),
array('食狮.com.cn', 'com.cn', '食狮.com.cn', null, '食狮.com.cn'),
// Test punycode URLs
array('www.xn--85x722f.xn--fiqs8s', 'xn--fiqs8s', 'xn--85x722f.xn--fiqs8s', 'www', 'www.xn--85x722f.xn--fiqs8s'),
array('xn--85x722f.com.cn', 'com.cn', 'xn--85x722f.com.cn', null, 'xn--85x722f.com.cn'),
);
}
}
14 changes: 14 additions & 0 deletions tests/library/Pdp/Uri/UrlTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,18 @@ public function testFtpUrlToString()
$url = $this->parser->parseUrl($ftpUrl);
$this->assertEquals($ftpUrl, $url->__toString());
}

/**
 * Parsing the scheme-less, mixed-case IDN 'Яндекс.РФ' and casting the result
 * back to a string is expected to give 'http://яндекс.рф'.
 *
 * @group issue29
 * @see https://github.com/jeremykendall/php-domain-parser/issues/29
 */
public function testIdnToAscii()
{
    $parsed = $this->parser->parseUrl('Яндекс.РФ');

    $this->assertEquals('http://яндекс.рф', $parsed->__toString());
}
}

0 comments on commit bafb754

Please sign in to comment.