Merge pull request #286 from yldoctrine/fix_sitemap_crawler
Correctly extract all Sitemap URLs from robots.txt
fhamborg authored Aug 12, 2024
2 parents e475bcf + 6d467c5 commit 758c008
Showing 1 changed file with 4 additions and 1 deletion.
newsplease/helper_classes/url_extractor.py (4 additions, 1 deletion)
@@ -28,6 +28,7 @@
 # to improve performance, regex statements are compiled only once per module
 re_www = re.compile(r"^(www.)")
 re_domain = re.compile(r"[^/.]+\.[^/.]+$")
+re_sitemap = re.compile(r"Sitemap:\s([^\r\n#]*)", re.MULTILINE)
 
 
 class UrlExtractor(object):
@@ -194,7 +195,9 @@ def get_sitemap_urls(domain_url: str, allow_subdomains: bool, check_certificate:
             url=domain_url, allow_subdomains=allow_subdomains, check_certificate=check_certificate
         )
         if robots_response and robots_response.getcode() == 200:
-            return [robots_response.url]
+            robots_content = robots_response.read().decode("utf-8")
+            sitemap_urls = re_sitemap.findall(robots_content)
+            return sitemap_urls
         return UrlExtractor.check_sitemap_urls(domain_url=domain_url, check_certificate=check_certificate)
 
     @staticmethod
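For illustration only, here is a minimal sketch (not part of the commit) of what the new re_sitemap pattern does. The old code returned [robots_response.url], i.e. the address of robots.txt itself, whereas the new code reads the file body and collects every Sitemap directive. The robots.txt content below is made-up example data.

import re

# Pattern added by this commit: capture everything after "Sitemap: " up to a
# line break or an inline "#" comment, for every Sitemap directive in the file.
re_sitemap = re.compile(r"Sitemap:\s([^\r\n#]*)", re.MULTILINE)

# Hypothetical robots.txt body listing two sitemaps (example data only).
robots_content = """\
User-agent: *
Disallow: /private/

Sitemap: https://example.com/sitemap_index.xml
Sitemap: https://example.com/news-sitemap.xml
"""

print(re_sitemap.findall(robots_content))
# ['https://example.com/sitemap_index.xml', 'https://example.com/news-sitemap.xml']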
