Merge pull request #286 from yldoctrine/fix_sitemap_crawler
Correctly extract all Sitemap URLs from robots.txt
fhamborg authored Aug 12, 2024
2 parents e475bcf + 6d467c5 commit 758c008
Showing 1 changed file with 4 additions and 1 deletion.
newsplease/helper_classes/url_extractor.py (4 additions, 1 deletion)
@@ -28,6 +28,7 @@
 # to improve performance, regex statements are compiled only once per module
 re_www = re.compile(r"^(www.)")
 re_domain = re.compile(r"[^/.]+\.[^/.]+$")
+re_sitemap = re.compile(r"Sitemap:\s([^\r\n#]*)", re.MULTILINE)
 
 
 class UrlExtractor(object):
@@ -194,7 +195,9 @@ def get_sitemap_urls(domain_url: str, allow_subdomains: bool, check_certificate:
             url=domain_url, allow_subdomains=allow_subdomains, check_certificate=check_certificate
         )
         if robots_response and robots_response.getcode() == 200:
-            return [robots_response.url]
+            robots_content = robots_response.read().decode("utf-8")
+            sitemap_urls = re_sitemap.findall(robots_content)
+            return sitemap_urls
         return UrlExtractor.check_sitemap_urls(domain_url=domain_url, check_certificate=check_certificate)
 
     @staticmethod
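For illustration only, here is a minimal sketch (not part of the commit) of what the new re_sitemap pattern does. The old code returned [robots_response.url], i.e. the address of robots.txt itself, whereas the new code reads the file body and collects every Sitemap directive. The robots.txt content below is made-up example data.

import re

# Pattern added by this commit: capture everything after "Sitemap: " up to a
# line break or an inline "#" comment, for every Sitemap directive in the file.
re_sitemap = re.compile(r"Sitemap:\s([^\r\n#]*)", re.MULTILINE)

# Hypothetical robots.txt body listing two sitemaps (example data only).
robots_content = """\
User-agent: *
Disallow: /private/

Sitemap: https://example.com/sitemap_index.xml
Sitemap: https://example.com/news-sitemap.xml
"""

print(re_sitemap.findall(robots_content))
# ['https://example.com/sitemap_index.xml', 'https://example.com/news-sitemap.xml']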
