Skip to content

Commit

Permalink
Ability to extract urls from sitemap index files and handle compressed sitemap files
Browse files Browse the repository at this point in the history
  • Loading branch information
santhosh committed Feb 21, 2024
1 parent e56f097 commit a05a313
Showing 1 changed file with 39 additions and 13 deletions.
52 changes: 39 additions & 13 deletions lib/shared/layers/python-sdk/python/genai_core/websites/sitemap.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,44 @@
import requests
import defusedxml.ElementTree as ET
import gzip
import os

def decompress_gzip_data(response):
    """Return the gunzipped body of an HTTP response.

    The payload is decompressed entirely in memory. No temporary file is
    written, so nothing is left behind on disk when the data turns out to
    be corrupt, and concurrent calls cannot clash over a shared file name.

    Args:
        response: An object exposing the raw body as ``response.content``
            (bytes), e.g. a ``requests.Response``.

    Returns:
        The decompressed payload as ``bytes``.

    Raises:
        OSError: If the content is not valid gzip data
            (``gzip.BadGzipFile`` is a subclass of ``OSError``).
        EOFError: If the gzip stream is truncated.
    """
    return gzip.decompress(response.content)

# XML namespace used by the sitemap protocol, in ElementTree "{uri}" form.
_SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
# Every gzip stream starts with these two bytes.
_GZIP_MAGIC = b"\x1f\x8b"


def extract_urls_from_sitemap(sitemap_url: str) -> list:
    """Collect every page URL reachable from a sitemap or sitemap index.

    A ``<urlset>`` document contributes its ``<loc>`` entries directly. A
    ``<sitemapindex>`` document is followed recursively into each child
    sitemap. Gzip-compressed sitemaps are decompressed transparently.

    This is a best-effort crawl: fetch or parse failures are printed and
    the affected sitemap contributes no URLs, but the function never
    raises and always returns a list.

    Args:
        sitemap_url: URL of the sitemap (or sitemap index) to read.

    Returns:
        A list of URL strings (possibly empty).
    """
    return _collect_sitemap_urls(sitemap_url, set())


def _loc_texts(root, parent_tag):
    """Return the stripped, non-empty ``<loc>`` values under ``parent_tag``."""
    path = f"{_SITEMAP_NS}{parent_tag}/{_SITEMAP_NS}loc"
    return [
        elem.text.strip()
        for elem in root.findall(path)
        if elem.text and elem.text.strip()
    ]


def _collect_sitemap_urls(sitemap_url, visited):
    """Fetch one sitemap and return its URLs, recursing into index entries.

    ``visited`` holds the sitemap URLs already fetched during this crawl so
    that an index referencing itself (directly or through a cycle) cannot
    cause unbounded recursion.
    """
    if sitemap_url in visited:
        return []
    visited.add(sitemap_url)

    urls = []
    try:
        # A timeout keeps one unresponsive host from hanging the whole crawl.
        response = requests.get(sitemap_url, timeout=30)
        if response.status_code != 200:
            print(f'Error while fetching sitemap data: {sitemap_url}')
            return urls

        # Detect gzip from the payload rather than the URL suffix: the URL
        # may carry a query string, and a server may already have had the
        # body transparently decoded via Content-Encoding.
        if response.content[:2] == _GZIP_MAGIC:
            sitemap = decompress_gzip_data(response)
        else:
            sitemap = response.content

        root = ET.fromstring(sitemap)
        root_tag = root.tag.lower()

        # If the root element is sitemapindex, fetch each child sitemap.
        if 'sitemapindex' in root_tag:
            for child_url in _loc_texts(root, "sitemap"):
                urls.extend(_collect_sitemap_urls(child_url, visited))
        elif 'urlset' in root_tag:
            urls.extend(_loc_texts(root, "url"))
        else:
            print(f'No valid root tag found for sitemap: {sitemap_url}')
    except Exception as e:
        # Deliberate best-effort boundary: report and keep what was gathered
        # so one broken sitemap does not discard the rest of the crawl.
        print(f'Error while processing sitemaps for {sitemap_url}', e)
    return urls

0 comments on commit a05a313

Please sign in to comment.