-
Notifications
You must be signed in to change notification settings - Fork 344
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Ability to extract urls from sitemap index files and handle compressed sitemap files
- Loading branch information
santhosh
committed
Feb 21, 2024
1 parent
e56f097
commit a05a313
Showing
1 changed file
with
39 additions
and
13 deletions.
There are no files selected for viewing
52 changes: 39 additions & 13 deletions
52
lib/shared/layers/python-sdk/python/genai_core/websites/sitemap.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,44 @@ | ||
import requests | ||
import defusedxml.ElementTree as ET | ||
import gzip | ||
import os | ||
|
||
def decompress_gzip_data(response):
    """Return the decompressed bytes of a gzip-encoded HTTP response body.

    Args:
        response: A response object exposing ``content`` (the raw gzip
            bytes). Only ``response.content`` is read.

    Returns:
        bytes: The decompressed payload (the sitemap XML).

    Raises:
        OSError/gzip.BadGzipFile: If ``response.content`` is not valid
            gzip data.
    """
    # Decompress in memory instead of round-tripping through a predictable
    # /tmp/<hash>.gzip path: the temp-file approach leaked the file whenever
    # gzip.open()/read() raised before os.remove(), and the predictable
    # filename was race-prone when two workers fetched the same URL.
    return gzip.decompress(response.content)
|
||
def extract_urls_from_sitemap(sitemap_url):
    """Recursively collect page URLs from a sitemap or sitemap-index URL.

    Fetches ``sitemap_url`` (transparently decompressing ``.gz`` payloads).
    If the XML root is a ``<sitemapindex>``, each child sitemap is fetched
    recursively; if it is a ``<urlset>``, its ``<loc>`` entries are collected.

    Args:
        sitemap_url: Absolute URL of a sitemap, a sitemap index, or a
            gzip-compressed (``.gz``) variant of either.

    Returns:
        list: Every page URL found. Always a list — empty (or partial) on
        fetch/parse errors, never ``None``.
    """
    # Sitemap protocol namespace (https://www.sitemaps.org/protocol.html).
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    urls = []
    try:
        # timeout keeps a dead host from hanging the crawl indefinitely
        # (restores the 30s timeout the pre-refactor code used).
        response = requests.get(sitemap_url, timeout=30)
        if response.status_code != 200:
            print(f'Error while fetching sitemap data: {sitemap_url}')
            return []

        # Handle sitemap with gzip compression
        if sitemap_url.lower().endswith('gz'):
            sitemap = decompress_gzip_data(response)
        else:
            sitemap = response.content

        root = ET.fromstring(sitemap)
        root_tag = root.tag.lower()

        # if root element is sitemapindex, fetch individual sitemaps recursively
        if 'sitemapindex' in root_tag:
            for elem in root.findall(f"{ns}sitemap/{ns}loc"):
                urls.extend(extract_urls_from_sitemap(elem.text))
        elif 'urlset' in root_tag:
            for elem in root.findall(f"{ns}url/{ns}loc"):
                urls.append(elem.text)
        else:
            print(f'No valid root tag found for sitemap: {sitemap_url}')
    except Exception as e:
        # Bug fix: the original used try/except/ELSE and returned ``urls``
        # only from the else branch, so any exception made this function
        # return None — recursive callers then crashed on
        # urls.extend(None), cascading None all the way up. Log and fall
        # through to return whatever was collected.
        print(f'Error while processing sitemaps for {sitemap_url}', e)
    return urls